#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2010, matek09, matek09@gmail.com'

from calibre.web.feeds.news import BasicNewsRecipe


class Polityka(BasicNewsRecipe):

    title = u'Polityka'
    __author__ = 'matek09'
    description = 'Weekly magazine. Last archive issue'
    encoding = 'utf-8'
    no_stylesheets = True
    language = 'pl'
    remove_javascript = True

    remove_tags_before = dict(name='h2', attrs={'class': 'box_nag'})
    remove_tags_after = dict(name='div', attrs={'class': 'box_footer'})

    remove_tags = [
        dict(name='h2', attrs={'class': 'box_nag'}),
        dict(name='div', attrs={'class': 'box_footer'}),
    ]

    extra_css = '''
        h1 {font-size: x-large; font-weight: bold}
        '''

    def parse_index(self):
        soup = self.index_to_soup('http://archiwum.polityka.pl/')
        box_img3 = soup.findAll(attrs={'class': 'box_img3'})
        feeds = []
        last = 0
        # The last 'box_img3' element on the archive page points at the most
        # recent issue; use it for the cover and as the starting edition URL.
        self.cover_url = 'http://archiwum.polityka.pl' + box_img3[-1].find('img')['src']
        last_edition = 'http://archiwum.polityka.pl' + box_img3[-1].find('a')['href']

        while True:
            index = self.index_to_soup(last_edition)

            box_list = index.findAll('div', attrs={'class': 'box_list'})
            if len(box_list) == 0:
                # No article listing on this page: we have run out of editions.
                break

            # Group articles by section; the section name is the part of the
            # page header before the first '/'.
            articles = {}
            for box in box_list:
                for div in box.findAll('div', attrs={'class': 'list_tresc'}):
                    article_page = self.index_to_soup('http://archiwum.polityka.pl' + div.a['href'])
                    section = self.tag_to_string(article_page.find('h2', attrs={'class': 'box_nag'})).split('/')[0].strip()
                    if section not in articles:
                        articles[section] = []
                    articles[section].append({
                        'title': self.tag_to_string(div.a),
                        'url': 'http://archiwum.polityka.pl' + div.a['href'],
                        'date': '',
                        'description': ''
                    })

            for section in articles:
                feeds.append((section, articles[section]))

            # Step to the next edition by bumping the numeric suffix of the
            # /wydanie/<n> URL.
            last_edition = last_edition.replace(
                'http://archiwum.polityka.pl/wydanie/' + str(last),
                'http://archiwum.polityka.pl/wydanie/' + str(last + 1))
            last += 1

        return feeds