__license__ = 'GPL v3'
__copyright__ = '2011, Attis '
__version__ = 'v. 0.1'

import re

from calibre.web.feeds.recipes import BasicNewsRecipe


class KopalniaWiedzy(BasicNewsRecipe):
    """Calibre recipe for kopalniawiedzy.pl.

    Articles are collected from the RSS feeds listed in ``feeds``.  Articles
    that are split over several pages are merged into one document by
    following the ``a.next`` pager link (see ``append_page``).
    """

    title = u'Kopalnia Wiedzy'
    publisher = u'Kopalnia Wiedzy'
    description = u'Ciekawostki ze świata nauki i techniki'
    encoding = 'utf-8'
    __author__ = 'Attis'
    language = 'pl'
    oldest_article = 7
    max_articles_per_feed = 100
    # Base URL that pager hrefs are appended to in append_page().
    INDEX = u'http://kopalniawiedzy.pl/'
    remove_javascript = True
    no_stylesheets = True

    # Markup filters applied to every downloaded article page.
    remove_tags = [
        {'name': 'p', 'attrs': {'class': 'keywords'}},
        {'name': 'div',
         'attrs': {'class': 'sexy-bookmarks sexy-bookmarks-bg-caring'}},
    ]
    remove_tags_after = dict(attrs={'class': 'ad-square'})
    keep_only_tags = [dict(name="div", attrs={'id': 'articleContent'})]

    extra_css = '.topimage {margin-top: 30px}'

    # NOTE(review): both patterns look truncated -- the first is empty (so the
    # substitution changes nothing) and the second only matches two
    # consecutive newlines.  Presumably markup inside the patterns and the
    # replacements was lost at some point; confirm against the upstream
    # recipe before relying on them.
    preprocess_regexps = [
        (re.compile(u''), lambda match: ''),
        (re.compile(u'\n\n'), lambda match: ''),
    ]

    feeds = [
        (u'Biologia', u'http://kopalniawiedzy.pl/wiadomosci_biologia.rss'),
        (u'Medycyna', u'http://kopalniawiedzy.pl/wiadomosci_medycyna.rss'),
        (u'Psychologia',
         u'http://kopalniawiedzy.pl/wiadomosci_psychologia.rss'),
        (u'Technologie',
         u'http://kopalniawiedzy.pl/wiadomosci_technologie.rss'),
        (u'Ciekawostki',
         u'http://kopalniawiedzy.pl/wiadomosci_ciekawostki.rss'),
        (u'Artykuły', u'http://kopalniawiedzy.pl/artykuly.rss'),
    ]

    def is_link_wanted(self, url, tag):
        """Return True when *tag* carries the CSS class ``next``.

        :param url: URL of the link (unused).
        :param tag: the link element; only its ``class`` attribute is read.
        """
        # .get() instead of [] so that a link without a class attribute is
        # simply rejected rather than raising KeyError.
        css_class = tag.get('class')
        # Some parsers expose ``class`` as a list of tokens, not a string.
        if isinstance(css_class, (list, tuple)):
            return 'next' in css_class
        return css_class == 'next'

    def remove_beyond(self, tag, next):
        """Remove every sibling on one side of *tag* and of its ancestors.

        Starting at *tag* and climbing through ``.parent`` until a ``body``
        element (or the top of the tree) is reached, all nodes reachable
        through the sibling attribute named by *next* are extracted.

        :param tag: starting node; ``None`` makes the call a no-op.
        :param next: attribute name to follow, e.g. ``'nextSibling'``.
        """
        while tag is not None and getattr(tag, 'name', None) != 'body':
            sibling = getattr(tag, next)
            while sibling is not None:
                # Fetch the onward link from the node itself *before* it is
                # removed, so each node is extracted exactly once.
                following = getattr(sibling, next)
                sibling.extract()
                sibling = following
            tag = tag.parent

    def append_page(self, soup, appendtag, position):
        """Recursively fetch continuation pages and insert them.

        Looks for an ``<a class="next">`` link in *soup*; when present, the
        linked page is downloaded, trimmed to its ``div#articleContent``
        (with everything after the ``.pages`` element removed), extended
        with its own continuation pages, and finally inserted into
        *appendtag* at index *position*.

        :param soup: parsed page to search for the pager link.
        :param appendtag: element that receives the continuation content.
        :param position: index within *appendtag* at which to insert.
        """
        pager = soup.find('a', attrs={'class': 'next'})
        if pager is None:
            return

        href = pager.get('href')
        if not href:
            # A pager link without a target cannot be followed.
            return

        # The href is appended verbatim, so it is assumed to be relative to
        # INDEX -- TODO confirm against the live site.
        nexturl = self.INDEX + href
        soup2 = self.index_to_soup(nexturl)

        texttag = soup2.find('div', attrs={'id': 'articleContent'})
        if texttag is None:
            # Continuation page has no article container: keep what has been
            # collected so far instead of failing with AttributeError.
            return

        # remove_beyond() tolerates a missing '.pages' element (tag is None).
        tag = texttag.find(attrs={'class': 'pages'})
        self.remove_beyond(tag, 'nextSibling')

        # Append any further pages at the end of this page's content first,
        # then attach the combined result to the caller's container.
        newpos = len(texttag.contents)
        self.append_page(soup2, texttag, newpos)
        appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        """Merge continuation pages into *soup* and drop pager/'wykop' blocks.

        :param soup: parsed article page.
        :returns: the same *soup* object, modified in place.
        """
        self.append_page(soup, soup.body, 3)
        for name, css_class in (('div', 'pages'), ('p', 'wykop')):
            for item in soup.findAll(name, attrs={'class': css_class}):
                item.extract()
        return soup