library(RJSONIO)
library(XML)
library(knitr)
library(ggplot2)
library(scales)
library(sets)

# Read a CSV file from the local 'download' directory.
#
# filename: name of the file inside 'download'.
# Returns the data frame produced by read.csv, with strings kept as
# character vectors rather than converted to factors.
od500.csv <- function(filename) {
  csv.path <- file.path('download', filename)
  read.csv(csv.path, stringsAsFactors = FALSE)
}

# Parse a JSON file from the local 'download' directory.
#
# filename: name of the file inside 'download'.
# Returns whatever fromJSON produces for that file.
od500.json <- function(filename) {
  json.path <- file.path('download', filename)
  fromJSON(json.path)
}

# Parse the index.html of one section of the local 'www.opendata500.com'
# directory tree.
#
# filename: name of the sub-directory whose index.html should be parsed.
# Returns the document returned by htmlParse.
od500.html <- function(filename) {
  page.path <- file.path('www.opendata500.com', filename, 'index.html')
  htmlParse(page.path)
}

# Load OD500_Datasets.json and flatten its records into a data frame.
#
# Returns a data frame whose columns have been flattened with unlist.
read.datasets <- function() {
  datasets <- od500.json('OD500_Datasets.json')

  # Records 154 and 155 get their usedByCompany field overwritten with NA
  # before conversion.
  # NOTE(review): presumably these two records would otherwise break the
  # conversion below -- confirm against the JSON file.
  for (i in c(154, 155)) {
    datasets[i][[1]]$usedByCompany <- NA
  }

  # Turn each record into a data frame and let sapply combine them
  records <- sapply(datasets, function(d) {
    data.frame(d, stringsAsFactors = FALSE)
  })

  # Transpose the combined result, then flatten each column into a vector
  transposed <- data.frame(t(records), stringsAsFactors = FALSE)
  data.frame(lapply(transposed, unlist), stringsAsFactors = FALSE)
}

# Join the company names and IDs found in OD500_Companies.json onto the rows
# of 500_Companies.csv.
#
# Returns the data frame produced by merge(): one row per matching pair of
# JSON company name and CSV CompanyName.
read.candidates <- function() {
  x <- od500.json('OD500_Companies.json')
  ids <- data.frame(
    companyName = sapply(x, function(y){y$companyName}),
    companyID = sapply(x, function(y){y$companyID}),
    stringsAsFactors = FALSE
  )
  # The join key is called 'companyName' in the JSON-derived table and
  # 'CompanyName' in the CSV; by.x must use the JSON spelling, otherwise
  # merge() stops because the column does not exist in `ids`.
  # Caveat from the original author: the identifier column is not unique, so
  # the result may contain duplicated rows for repeated names.
  merge(ids, od500.csv('500_Companies.csv'), by.x = 'companyName', by.y = 'CompanyName')
}

# Render the distinct values of something as one sorted, comma-separated
# string. Dispatches on the class of the first argument.
#
# x: a character vector, or a list of records.
# y: for lists, the name of the field to collect from every record.
pretty.levels <- function(x, y) {
  UseMethod('pretty.levels')
}

# Character method: sorted unique values of `vec`, joined by ', '.
pretty.levels.character <- function(vec) {
  paste(sort(unique(vec)), collapse = ', ')
}

# List method: collect `thefield` from every element of `thelist`, then return
# the sorted unique values joined by ', '.
#
# The values are gathered with lapply + unlist rather than sapply: when a
# record lacks the field, or the field has a different length between records,
# sapply returns a list and sort() fails on it. Records without the field
# simply contribute nothing, and an empty list yields "".
pretty.levels.list <- function(thelist, thefield) {
  values <- unlist(lapply(thelist, function(x) { x[[thefield]] }))
  paste(sort(unique(values)), collapse = ', ')
}

# Load every source that is compared below: the datasets JSON, the preview and
# candidate CSV files, the companies JSON, and the two saved HTML pages.
datasets.json <- read.datasets()
preview.csv <- od500.csv('Preview50_Companies.csv')
# Note: preview.json is read from OD500_Companies.json, the same file that
# read.candidates() parses.
preview.json <- od500.json('OD500_Companies.json')
candidates.csv <- od500.csv('500_Companies.csv')
# One <li class="m-list-company"> node per entry of the preview page's list
preview.xpath <- '//ul[@class="m-preview-list"]/li[@class="m-list-company"]'
preview.html <- xpathApply(od500.html('preview'), preview.xpath)

# Every direct <div> child of the candidates container on the candidates page
candidates.xpath <- '//div[@class="m-candidates isotopes-container"]/div'
candidates.html.nodes <- xpathApply(od500.html('candidates'), candidates.xpath)
# Evaluate an XPath expression against every node in candidates.html.nodes and
# collect the first result from each.
#
# xpath: XPath expression evaluated relative to each candidate node.
# func:  function applied to each first result (defaults to identity).
# Returns the sapply-simplified results, one per candidate node.
candidates.html.field <- function(xpath, func = identity) {
  first.matches <- lapply(candidates.html.nodes, function(node) {
    matches <- xpathApply(node, xpath)
    matches[[1]]
  })
  sapply(first.matches, func)
}

# The container <div> itself (first match) rather than its children
candidates.parent.xpath <- '//div[@class="m-candidates isotopes-container"]'
candidates.parent.html <- xpathApply(od500.html('candidates'), candidates.parent.xpath)[[1]]

# XPath boolean expressions that test whether the context node's class
# attribute contains the given marker
preview.company.xpath <- 'contains(@class, "preview-company")'
survey.company.xpath <- 'contains(@class, "survey-company")'

# Table with one row per node in candidates.html.nodes, built by pulling one
# field at a time out of each node. Text fields are converted with xmlValue;
# the two *.company columns come from the boolean class tests and href from
# the link attribute, all three left as returned (func = identity).
candidates.html <- data.frame(
  name = candidates.html.field('a/h3/strong/text()', xmlValue),
  city = candidates.html.field('p[@class="m-homepage-list-location"]/text()', xmlValue),
  func = candidates.html.field('em/text()', xmlValue),
  desc = candidates.html.field('p[@class="m-homepage-list-desc"]/text()', xmlValue),
  preview.company = candidates.html.field(preview.company.xpath),
  survey.company = candidates.html.field(survey.company.xpath),
  href = candidates.html.field('a/@href')
)

# Names of the preview companies according to each of the four sources:
# the candidates page (rows flagged preview.company), the preview page,
# the preview CSV and the companies JSON.
preview.companies.candidates.html <- as.character((subset(candidates.html, preview.company)$name))
preview.companies.preview.html <- sapply(preview.html,
  function(x){xmlValue(xpathApply(x, 'descendant::div[@class="m-preview-list-name"]/strong/text()')[[1]])})
preview.companies.csv <- unique(preview.csv$CompanyName)
preview.companies.json <- sapply(preview.json, function(x){x$companyName})

# NOTE(review): the second positional argument of sets::cset() is `orderfun`,
# so the CSV names are passed as an ordering, not as a second set -- confirm
# whether a set comparison between the two name lists was intended.
d <- cset(preview.companies.candidates.html, preview.companies.csv)

# Bar chart of candidate companies by questionnaire completion, with bars
# filled by whether each company is flagged as a preview company. Built inside
# an immediately-invoked function so the factor recoding stays local and the
# global candidates.html is left untouched.
p <- (function() {
  # Recode a logical flag as a factor with levels 'Yes' then 'No'
  yes.no <- function(flag) {
    factor(flag, levels = c(TRUE, FALSE), labels = c('Yes', 'No'))
  }

  plot.data <- candidates.html
  plot.data$preview.company <- yes.no(plot.data$preview.company)
  plot.data$survey.company <- yes.no(plot.data$survey.company)

  ggplot(plot.data, aes(x = survey.company, fill = preview.company)) +
    scale_fill_discrete('Is the company listed as a preview company\nin the preview HTML file') +
    scale_x_discrete('Did the company complete the questionnaire?') +
    scale_y_continuous('Number of companies') +
    theme(legend.position = 'bottom') +
    geom_bar() +
    ggtitle('Are the preview companies simply the companies who have responded to the questionnaire?')
})()

# Named bundle of the loaded sources, keyed by the same names as the global
# variables they were read into.
exports <- list(
  datasets.json = datasets.json,
  preview.csv = preview.csv,
  preview.html = preview.html,
  preview.json = preview.json,
  candidates.csv = candidates.csv,
  candidates.html = candidates.html
)

# knit('dictionary.Rmd')


# library(devtools)
# install_github("RODProt", "QBRC")

# library(RODProt)
# read_data_package('./datapackage.json')
# open.data.500 <- read_data_package('./datapackage.json')
# companies <- get_resource(open.data.500, 'companies', overlook.types = TRUE)
# datasets <- get_resource(open.data.500, 'datasets', overlook.types = TRUE)

# companies <- read.csv('companies.csv', stringsAsFactors = FALSE)
# datasets <- read.csv('datasets.csv', stringsAsFactors = FALSE)
# NOTE(review): `companies` and `datasets` are not assigned anywhere in the
# visible part of this file (the read.csv / get_resource lines above are
# commented out), so they must already exist when this section runs -- confirm.

# Count of newline characters in each company's `datasets` field, minus one.
# NOTE(review): presumably the field lists one dataset per line -- confirm
# that the "- 1" matches how the field is terminated.
companies$n.datasets <- nchar(gsub('[^\n]','', companies$datasets)) - 1

# Hostname of each dataset URL: drop an optional http://, ftp:// or https://
# prefix and keep everything up to the first '/'.
datasets$dataset.hostname <- sub('^(?:http://|ftp://|https://)?([^/]*)/?.*$', '\\1', datasets$dataset.url)
# A URL without any '.' is treated as missing: both the URL and its hostname
# are set to NA.
url.is.na <- !grepl('\\.', datasets$dataset.url)
datasets$dataset.hostname[url.is.na] <- NA
datasets$dataset.url[url.is.na] <- NA

# Same hostname extraction for the company URL (no NA handling is applied here)
datasets$company.hostname <- sub('^(?:http://|ftp://|https://)?([^/]*)/?.*$', '\\1', datasets$company.url)

library(plyr)
# One row per company.href: the first dataset row of that company, extended
# with the number of distinct dataset hostnames (NA counts as a value) and the
# total number of dataset rows. The `datasets` column of the result holds that
# total count.
companies.with.hostnames <- ddply(datasets, 'company.href', function(company.rows) {
  hostnames <- company.rows$dataset.hostname
  summary.row <- company.rows[1, ]
  summary.row$unique.dataset.hostnames <- length(unique(hostnames))
  summary.row$datasets <- length(hostnames)
  summary.row
})
# Histogram of how many distinct dataset hostnames each company uses
p.hostnames <- ggplot(companies.with.hostnames, aes(x = unique.dataset.hostnames)) +
  geom_histogram() +
  labs(
    x = 'Number of different website hostnames',
    y = 'Number of companies'
  )

# Number of companies for every combination of (dataset count, distinct
# hostname count).
df <- ddply(
  companies.with.hostnames,
  c('datasets', 'unique.dataset.hostnames'),
  function(group) { c(companies = nrow(group)) }
)

# Scatter plot of dataset count against distinct hostname count, with point
# size showing how many companies share each combination. The layer must not
# set a constant `size`: a constant given to geom_point() overrides the
# `size = companies` mapping and would draw every point at the same size.
p.hostnames.datasets <- ggplot(df) +
  aes(x = datasets, y = unique.dataset.hostnames, size = companies) +
  geom_point()

# Company names plotted at (full-time employees, number of datasets reported).
# Text labels are used instead of points; an earlier point layer,
# geom_point(size = 10), is left disabled.
p.fte.datasets <- ggplot(
  companies,
  aes(y = n.datasets, x = fte, label = company.name)
) +
  scale_x_continuous('Number of full-time employees', labels = comma) +
  scale_y_continuous('Number datasets reported', labels = comma) +
  ggtitle('Larger companies don\'t report more datasets.') +
  geom_text()

# Length of each company's social-impact text against the length of its
# financial-info text, on equal axis scales with a slope-1 reference line.
p.priorities <- ggplot(
  companies,
  aes(x = nchar(social.impact), y = nchar(financial.info))
) +
  geom_point() +
  coord_fixed() +
  geom_abline(slope = 1) +
  ggtitle('How much they write about social and financial things')

library(sqldf)
# Per company (grouped by company_href): number of dataset rows whose URL does
# not contain '.gov', ordered by that count.
# NOTE(review): the SQL uses underscore names (company_name, dataset_url,
# company_href) while the data frame is accessed elsewhere with dotted names
# (company.name, dataset.url, company.href) -- this relies on sqldf
# translating the column names; confirm with the installed sqldf version.
non.gov <- sqldf('SELECT company_name, count(*) FROM datasets WHERE dataset_url NOT LIKE \'%.gov%\' group by company_href order by 2')
# Datasets hosted on the company's own hostname; rows where either hostname is
# NA are dropped by subset().
own.data <- subset(datasets, company.hostname == dataset.hostname)[c('company.name','dataset.name')]

# subset(companies, data.collection == 'questionnaire')$location