#!/usr/bin/python
# version 0.1 Wenming Ye 2/25/2012
# Extract the English, text-only content from the Gutenberg DVD (2010 edition).
# If you have questions, please contact me for the latest version.
# Feel free to modify the script to your needs.
#
# STEP 1: Run this script in the Cygwin environment, from the DVD's main
#         INDEXES directory ("www.gutenberg.org/INDEXES"). If you don't want
#         to use Cygwin, modify the "cp" command embedded in the script.
#         It parses the HTML index pages (TITLES_*.HTML), finds the
#         English-language books, and copies their ZIP archives into
#         INDEXES/zips, skipping PDF, HTML, image, and non-English items.
# STEP 2: Extract all the zip files:   find ./ -name "*.zip" -exec unzip -o {} \;
# STEP 3: List the text files:         find ./ -name "*.txt"
#         You should see about 26942 text files in total.
# STEP 4: Remove the readme files:     find ./ -name "*readme.txt" -exec rm {} \;
# STEP 5: Do the same for htm, html, etc., then collect the roughly 26900
#         remaining, relatively clean files into one place:
#                                      find ./ -name "*.txt" -exec cp {} my_text_dir \;
# TODO: get rid of UTF-8 duplicates vs. ASCII.

from HTMLParser import HTMLParser
import urllib
import os
import commands


# Parses a book's HTML property page and extracts the ZIP URLs (the actual
# downloadable archives for the book).
class BookPropertyHTMLParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.url_list = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            # attrs is a list of (name, value) pairs; only href values matter here.
            for (name, value) in attrs:
                if name != "href":
                    continue
                if value.count(".zip") != 0 or value.count(".txt") != 0:
                    if value.count("h.zip") != 0:
                        continue  # skip anything ending in h.zip: the HTML edition (verified)
                    elif value.count("_images.zip") != 0:
                        continue  # skip image bundles
                    elif value.count("_pdf.zip") != 0:
                        continue  # skip PDF editions
                    else:
                        self.url_list.append(value)
                        # Copy the archive into INDEXES/zips.
                        # Change "cp" to xcopy for a plain Windows cmd shell.
                        commands.getstatusoutput('cp ' + value + " zips")


# Parses a TITLES_*.HTML index page and collects the property-page URL of
# every English-language book (marked "(English)" in its h3 title).
class TitleFilesHTMLParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.book_title = ""
        self.book_attr = ""
        self.book_property_list = []

    def handle_starttag(self, tag, attrs):
        if tag == "h3":
            self.book_title = ""
            self.book_attr = ""
        if tag == "a":
            for (name, value) in attrs:
                if name == "href":
                    self.book_attr = value

    def handle_endtag(self, tag):
        if tag == "h3":
            if self.book_title.count("(English)") != 0:
                # Upper-case the relative path to match the DVD's file naming.
                self.book_property_list.append(self.book_attr.upper())

    def handle_data(self, data):
        self.book_title += data


# Fetch a book's property page, print the ZIP URLs found on it, and let the
# parser copy the archives into INDEXES/zips.
def get_zip_urls(book_url):
    book_url_string = "file://" + os.getcwd() + "/" + book_url
    book_page_file = urllib.urlopen(book_url_string)
    book_page_file_string = book_page_file.read()
    book_page_file.close()
    book_page_parser = BookPropertyHTMLParser()
    book_page_parser.feed(book_page_file_string)
    print book_url, book_page_parser.url_list
    # You might want to get rid of duplicates for each book:
    # some books ship both a UTF-8 and an ASCII edition.
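
# A minimal sketch for the duplicate-edition note above. It assumes the DVD
# follows the Project Gutenberg naming convention where "12345-0.zip" is the
# UTF-8 edition and "12345-8.zip" the Latin-1 edition of plain "12345.zip";
# that convention, and the helper's name, are assumptions for illustration,
# not part of the original script. Given one book's url_list, it keeps a
# single preferred variant per numeric ID (plain ASCII first).
def pick_preferred_zip(url_list):
    preferred = {}
    for url in url_list:
        base = os.path.basename(url)
        book_id = base.split('-')[0].split('.')[0]  # e.g. "28964" from "28964-0.zip"
        if '-0.' in base:
            rank = 1  # UTF-8 edition
        elif '-8.' in base:
            rank = 2  # Latin-1 edition
        else:
            rank = 0  # plain ASCII edition, preferred
        if book_id not in preferred or rank < preferred[book_id][0]:
            preferred[book_id] = (rank, url)
    return [url for (rank, url) in preferred.values()]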
# Loop through one title page and visit each book's property-page URL.
def get_english_only_urls(title_page_url):
    title_page_file = urllib.urlopen(title_page_url)
    title_page_string = title_page_file.read()
    title_page_file.close()
    parser = TitleFilesHTMLParser()
    parser.feed(title_page_string)
    global total_books
    total_books += len(parser.book_property_list)
    # For debugging a single book, override the list like this:
    #parser.book_property_list = ['../etext/28964.html']
    # Go parse each property page and copy the zip files it links to.
    for book_url in parser.book_property_list:
        get_zip_urls(book_url)


# MAIN SCRIPT. Run this from the DVD's main INDEXES directory
# ("www.gutenberg.org/INDEXES").
if not os.path.exists("zips"):
    os.makedirs("zips")

# Build the list of title pages: TITLES_A.HTML .. TITLES_Z.HTML, TITLES_OTHER.HTML.
titleFileList = []
total_books = 0
for i in range(ord('A'), ord('Z') + 1):
    titleFileList.append(chr(i))
titleFileList.append('OTHER')

# For each title page, find the URL of every English book's HTML description
# page; on that page, extract and copy the ZIP file of the actual book.
for i in titleFileList:
    title_page_url = "file://" + os.getcwd() + "/TITLES_" + i + ".HTML"
    get_english_only_urls(title_page_url)

print total_books
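
# Hedged alternative to STEPS 2-5 for environments without Cygwin: a minimal
# sketch using only the standard library (zipfile, shutil, os.walk) in place
# of find/unzip/cp. "my_text_dir" mirrors the directory named in the STEP 5
# comment; the function name and layout are assumptions for illustration.
# Uncomment the last line to run it once the copy phase above has populated
# INDEXES/zips.
import zipfile
import shutil

def extract_and_collect(zip_dir="zips", text_dir="my_text_dir"):
    if not os.path.exists(text_dir):
        os.makedirs(text_dir)
    # STEP 2 equivalent: unzip every archive in place.
    for name in os.listdir(zip_dir):
        if name.lower().endswith(".zip"):
            archive = zipfile.ZipFile(os.path.join(zip_dir, name))
            archive.extractall(zip_dir)
            archive.close()
    # STEPS 3-5 equivalent: copy the .txt files, skipping *readme.txt.
    count = 0
    for dirpath, dirnames, filenames in os.walk(zip_dir):
        for filename in filenames:
            lower = filename.lower()
            if lower.endswith(".txt") and not lower.endswith("readme.txt"):
                shutil.copy(os.path.join(dirpath, filename), text_dir)
                count += 1
    print count, "text files collected in", text_dir

# extract_and_collect()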