#Web Scraping part 2: Diggint in #Rolf Fredheim and Aiora Zabala #University of Cambridge #25/02/2014 #=========================================================================== #Variables================================== uni <- "The University of Cambridge" uni #Paying tax: #9400 tax free (20000-9440)*20/100 #OR: wage <- 20000 taxFree <- 9400 rate <- 20 (wage-taxFree)*rate/100 #Functions without variables================ printName <- function(){ print ("My name is Rolf Fredheim") } printName() #e.g. for simulations======================= sillySimulation <- function(){ x1 <- runif(500,80,100) x2 <- runif(500,0,100) v1 <- c(x1,x2) x3 <- runif(1000,0,100) df <- data.frame(v1,x3) require(ggplot2) print(ggplot(df, aes(v1,x3))+geom_point()+ggtitle("simulation of some sort")) } #Inserting variables======================= desperateTimes <- function(){ print(paste0("Rolf is struggling to finish his PhD on time. Time remaining: 6 months")) } #Name===================================== desperateTimes <- function(name){ print(paste0(name ," is struggling to finish his PhD on time. Time remaining: 6 months")) } desperateTimes(name="Tom") #Gender=================================== desperateTimes <- function(name,gender="m"){ if(gender=="m"){ pronoun="his" }else{ pronoun="her" } print(paste0(name ," is struggling to finish ",pronoun," PhD on time. Time remaining: 6 months")) } desperateTimes(name="Tanya",gender="f") #degree==================================== desperateTimes <- function(name,gender="m",degree){ if(gender=="m"){ pronoun="his" }else{ pronoun="her" } print(paste0(name ," is struggling to finish ",pronoun," ",degree," on time. Time remaining: 6 months")) } desperateTimes(name="Rolf",gender="m","Mphil") #Days til deadline========================= require(lubridate) require(ggplot2) deadline <- as.Date("2014-09-01") daysLeft <- deadline-Sys.Date() totDays <- deadline-as.Date("2011-10-01") print(daysLeft) print(paste0("Rolf is struggling to finish his PhD on time. Days remaining: ", as.numeric(daysLeft))) #part2===================================== print(paste0("Percentage to go: ",round(as.numeric(daysLeft)/as.numeric(totDays)*100))) df <- data.frame(days=c(daysLeft,totDays-daysLeft),lab=c("to go","completed")) ggplot(df,aes(1,days,fill=lab))+geom_bar(stat="identity",position="fill") #========================================== timeToWorry <- function(){ require(lubridate) deadline <- as.Date("2014-09-01") daysLeft <- deadline-Sys.Date() totDays <- deadline-as.Date("2011-10-01") print(daysLeft) print(paste0("Rolf is struggling to finish his PhD on time. Days remaining: ", as.numeric(daysLeft))) print(paste0("Percentage to go: ",round(as.numeric(daysLeft)/as.numeric(totDays)*100))) df <- data.frame(days=c(daysLeft,totDays-daysLeft),lab=c("to go","completed")) ggplot(df,aes(1,days,fill=lab))+geom_bar(stat="identity",position="fill") } #Last week's code========================== require(rjson) url <- "http://stats.grok.se/json/en/201201/web_scraping" raw.data <- readLines(url, warn="F") rd <- fromJSON(raw.data) rd.views <- rd$daily_views rd.views <- unlist(rd.views) rd <- as.data.frame(rd.views) rd$date <- rownames(rd) rownames(rd) <- NULL rd #Turn it into a function=================== getData <- function(url){ require(rjson) raw.data <- readLines(url, warn="F") rd <- fromJSON(raw.data) rd.views <- rd$daily_views rd.views <- unlist(rd.views) rd <- as.data.frame(rd.views) rd$date <- rownames(rd) rownames(rd) <- NULL rd$date <- as.Date(rd$date) return(rd) } getData("http://stats.grok.se/json/en/201201/web_scraping") #Creating the URLS========================= getUrls <- function(y1,y2,term){ root <- "http://stats.grok.se/json/en/" urls <- NULL for (year in y1:y2){ for (month in 1:9){ urls <- c(urls,(paste(root,year,0,month,"/",term,sep=""))) } for (month in 10:12){ urls <- c(urls,(paste(root,year,month,"/",term,sep=""))) } } return(urls) } #Put it together=========================== #create some URLS urls <- getUrls(y1=2013,y2=2014,"Euromaidan") #get data for each of them and store that data results=NULL for (url in urls){ results <- rbind(results,getData(url)) } head(results) ggplot(tail(results,100),aes(date,rd.views))+geom_line() #Download the web page===================== require(RCurl) require(XML) url <- "http://en.wikipedia.org/wiki/Euromaidan" SOURCE <- getURL(url,encoding="UTF-8") #Download the page #this is a very very long line. Let's not print it. Instead: substring (SOURCE,1,200) PARSED <- htmlParse(SOURCE) #Format the html code d #Accessing HTML elements in R============== xpathSApply(PARSED, "//h1") xpathSApply(PARSED, "//h1",xmlValue) #Digging deeper============================ xpathSApply(PARSED, "//h3",xmlValue) ========= length(xpathSApply(PARSED, "//a/@href")) #Get references============================ head(xpathSApply(PARSED, "//span[@class='citation news']",xmlValue)) head(xpathSApply(PARSED, "//span[@class='citation news']/a/@href")) links <- (xpathSApply(PARSED, "//span[@class='citation news']/a/@href")) browseURL(links[1]) #XPath2==================================== (xpathSApply(PARSED, "//*[@class='citation news'][17]/a/@*")) (xpathSApply(PARSED, "//span[@class='citation news'][17]/a/@*")) #XPath3==================================== head(xpathSApply(PARSED, "//span[starts-with(@class,'citation')][17]/a/@href")) head(xpathSApply(PARSED, "//span[contains(@class,'citation')][17]/a/@href")) #Example - BBC article===================== url <- "http://www.bbc.co.uk/news/world-europe-26333587" SOURCE <- getURL(url,encoding="UTF-8") # Specify encoding when dealing with non-latin characters PARSED <- htmlParse(SOURCE) (xpathSApply(PARSED, "//h1[@class='story-header']",xmlValue)) (xpathSApply(PARSED, "//span[@class='date']",xmlValue)) #Meta field for better formatting (xpathSApply(PARSED, "//meta[@name='OriginalPublicationDate']/@content")) #Make a scraper============================ bbcScraper <- function(url){ SOURCE <- getURL(url,encoding="UTF-8") PARSED <- htmlParse(SOURCE) title <- (xpathSApply(PARSED, "//h1[@class='story-header']",xmlValue)) date <- as.character(xpathSApply(PARSED, "//meta[@name='OriginalPublicationDate']/@content")) return(c(title,date)) } #test It=================================== bbcScraper("http://www.bbc.co.uk/news/world-middle-east-26333533") bbcScraper("http://www.bbc.co.uk/sport/0/football/26332893") #Adding exceptions========================= bbcScraper2 <- function(url){ title <- date=NA #Return empty values in case field not found SOURCE <- getURL(url,encoding="UTF-8") PARSED <- htmlParse(SOURCE) title=(xpathSApply(PARSED, "//h1[@class='story-header']",xmlValue)) date <- (xpathSApply(PARSED, "//meta[@name='OriginalPublicationDate']/@content")) if (is.null(date)){ date=(xpathSApply(PARSED, "//span[@class='date']",xmlValue)) } return(c(title,as.character(date))) } bbcScraper2("http://www.bbc.co.uk/news/world-middle-east-26333533") bbcScraper2("http://www.bbc.co.uk/sport/0/football/26332893") #Guardian=================================== url <- "http://www.theguardian.com/commentisfree/2014/feb/25/how-much-cost-growers-bananas-68p-per-kilo" SOURCE <- getURL(url,encoding="UTF-8") PARSED <- htmlParse(SOURCE) xpathSApply(PARSED, "//h1[contains(@itemprop,'headline')]",xmlValue) xpathSApply(PARSED, "//a[@class='contributor']",xmlValue) xpathSApply(PARSED, "//time[@itemprop='datePublished']",xmlValue) xpathSApply(PARSED, "//time[@itemprop='datePublished']/@datetime") xpathSApply(PARSED, "//a[@rel='tag']",xmlValue) unique(xpathSApply(PARSED, "//a[@rel='tag']",xmlValue)) xpathSApply(PARSED, "//div[@id='article-body-blocks']",xmlValue) xpathSApply(PARSED, "//div[@id='article-body-blocks']/p",xmlValue) #Guardian scraper============================ guardianScraper <- function(url){ SOURCE <- getURL(url,encoding="UTF-8") # Specify encoding when dealing with non-latin characters PARSED <- htmlParse(SOURCE) title <- xpathSApply(PARSED, "//h1[contains(@itemprop,'headline')]",xmlValue) author <- xpathSApply(PARSED, "//a[@class='contributor']",xmlValue) time <- xpathSApply(PARSED, "//time[@itemprop='datePublished']/@datetime") tags <- unique(xpathSApply(PARSED, "//a[@rel='tag']",xmlValue)) text <- xpathSApply(PARSED, "//div[@id='article-body-blocks']/p",xmlValue) return(list(title=title, author=author, time=time, tags=paste(tags,collapse="|") ,text=paste(text,collapse="|"))) } #Using the scraper============================ a <- guardianScraper(url) a["title"] a[["title"]] a[["tags"]] #Example with dataframe. ===================== url <- "http://www.theguardian.com/uk" SOURCE <- getURL(url,encoding="UTF-8") PARSED <- htmlParse(SOURCE) urls <- xpathSApply(PARSED, "//div[@class='tabs-container']//*/@href") #This is a bit tricky, apologies. There may be a better way d <- lapply(urls,guardianScraper) data <- data.frame(matrix(unlist(d),ncol=5,byrow=T)) colnames(data) <- c("title","author","time","tags","text") as.character(data$tags) #Accessing this data later on================= require(stringr) #return title of texts mentioning Chelsea data[grep("Chelsea",data$tags),"title"] #return tags of texts mentioning Chelsea unlist(str_split(data[grep("Chelsea",data$tags),"tags"],"\\|")) #Advanced stuff: comments===================== url <- "http://discussion.theguardian.com/discussion/p/3n34d" SOURCE <- getURL(url,encoding="UTF-8") # Specify encoding when dealing with non-latin characters PARSED <- htmlParse(SOURCE) xpathSApply(PARSED, "//div[@class='d2-body'][1]",xmlValue) xpathSApply(PARSED, "//a[@class='d2-username']",xmlValue) #Comments2==================================== url <- "http://www.theguardian.com/commentisfree/2014/feb/25/how-much-cost-growers-bananas-68p-per-kilo" SOURCE <- getURL(url,encoding="UTF-8") PARSED <- htmlParse(SOURCE) links <- xpathSApply(PARSED, "//@href") shortUrl <- links[grep("//gu\\.",links)][1] require(stringr) temp <- unlist(str_split(shortUrl,"/")) target <- temp[length(temp)] discussionURL <- paste0("http://discussion.theguardian.com/discussion/p/",target) SOURCE <- getURL(discussionURL,encoding="UTF-8") PARSED <- htmlParse(SOURCE) xpathSApply(PARSED, "//a[@class='d2-username']",xmlValue) unique(c("r","fes","r")) duplicated author="By Rolf" author <- gsub("By ","",author) author #Solutions (1: Mirror)========================= #MIRROR url <- "http://www.mirror.co.uk/news/world-news/oscar-pistorius-trial-murder-reeva-3181393" SOURCE <- getURL(url,encoding="UTF-8") PARSED <- htmlParse(SOURCE) title <- xpathSApply(PARSED, "//h1",xmlValue) author <- xpathSApply(PARSED, "//li[@class='author']",xmlValue) time <- xpathSApply(PARSED, "//time[@itemprop='datePublished']/@datetime") #Telegraph===================================== url <- "http://www.telegraph.co.uk/news/uknews/terrorism-in-the-uk/10659904/Former-Guantanamo-detainee-Moazzam-Begg-one-of-four-arrested-on-suspicion-of-terrorism.html" SOURCE <- getURL(url,encoding="UTF-8") PARSED <- htmlParse(SOURCE) title <- xpathSApply(PARSED, "//h1[@itemprop='headline name']",xmlValue) author <- xpathSApply(PARSED, "//p[@class='bylineBody']",xmlValue) time <- xpathSApply(PARSED, "//p[@class='publishedDate']",xmlValue) #Independent=================================== url <- "http://www.independent.co.uk/news/world/asia/leopard-on-the-loose-puts-indian-city-of-meerut-on-lockdown-9151273.html" SOURCE <- getURL(url,encoding="UTF-8") PARSED <- htmlParse(SOURCE) title <- xpathSApply(PARSED, "//h1",xmlValue) author <- xpathSApply(PARSED, "//span[@class='authorName']",xmlValue) time <- xpathSApply(PARSED, "//p[@class='dateline']",xmlValue)