cablegate.recipe
#!/usr/bin/python
# vim:fileencoding=utf-8
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
import re
__license__ = 'WTFPL'
# Matches lines made up solely of '-' characters and whitespace
# (i.e. the separator/underline lines in the cables).
underline_re = re.compile(ur'^-[-\s]+$')
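# Illustrative examples (sample strings are mine, not taken from real cables):
#   underline_re.match(u'----')     matches  (separator line)
#   underline_re.match(u'- - - -')  matches  (dashes and whitespace only)
#   underline_re.match(u'-- note')  is None  (contains other characters)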
class WikileaksCablegate(BasicNewsRecipe):
title = u'Wikileaks: Cablegate'
publisher = u'Wikileaks'
category = u'Leak, Foreign Policy, USA'
    description = u"Today's Cablegate leaks from WikiLeaks."
language = 'en'
remove_tags_before = dict(name='table', attrs={'class':'cable'})
    # Note: I found that sometimes the <script> at the end won't get removed...
remove_tags = [dict(name='link'), dict(name='code'), dict(name='script')]
remove_javascript = True
max_articles_per_feed = 1000
# Pick a host from the list below. Only one!
# (To choose, just remove the '#' in front and make sure that all the other
# HOSTs have a '#' in front of them.)
#
#HOST = 'http://cablegate.r3blog.nl'
#HOST = 'http://wl.opsec.eu'
#HOST = 'http://87.106.58.253'
#HOST = 'http://cablegate.askedo.de'
HOST = 'http://wikileaks.ch'
# How many days of leaks to download.
#
DAYS = 2
    # Minimum number of characters a line must have before it stops being
    # considered a header or subtitle.
#
BREAKLINE_LENGTH = 50
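    # For instance (values illustrative): a short all-caps line such as
    # 'SUMMARY' would be treated as a header candidate, while a full
    # 70-character sentence would be considered body text.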
def get_masthead_url(self):
return self.HOST + '/static/gfx/WL_Hour_Glass_small.png'
def get_browser(self):
br = BasicNewsRecipe.get_browser()
        # Some mirrors seem to always send gzipped content, even when the
        # HTTP headers don't indicate it is supported. Thankfully this makes
        # that transparent.
br.set_handle_gzip(True)
return br
def cables_to_articles(self, cablessoup):
articles = []
for cable in cablessoup.findAll('tr'):
if cable.find('th') is not None:
continue
entries = cable.findAll('td')
cable_url = self.HOST + entries[0].a['href']
            cable_title = self.tag_to_string(entries[1])
cable_date = self.tag_to_string(entries[2])
cable_desc = self.tag_to_string(entries[4]) + ' ' + self.tag_to_string(entries[5])
articles.append({'title' : cable_title.strip(),
'date' : cable_date.strip(),
'url' : cable_url.strip(),
'description' : cable_desc.strip()})
#print "Article:",cable_title,cable_date,cable_url
return articles
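    # The dicts built above follow calibre's standard parse_index() article
    # format ('title', 'url', 'date' and 'description' keys).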
def parse_index(self):
cablegate = self.index_to_soup(self.HOST + '/cablegate.html')
cable_dates = []
cable_urls = []
# Get the last DAYS
for h3 in cablegate.findAll('h3'):
if h3.string and h3.string == 'Browse latest releases':
#look for divs
sibling = h3.nextSibling
aes = []
                while sibling is not None and (isinstance(sibling, NavigableString) or sibling.name != 'h3'):
if isinstance(sibling, Tag) and sibling.name == 'div' and sibling['class'] == 'sort':
aes.extend(sibling.findAll('a'))
sibling = sibling.nextSibling
aes.reverse()
#print aes
for i in xrange(min(self.DAYS, len(aes))):
cable_dates.append(aes[i].string)
cable_urls.append(self.HOST + aes[i]['href'])
break
feeds = []
for url,date in zip(cable_urls, cable_dates):
articles = []
print url
soup = self.index_to_soup(url)
aes = soup.find('div','paginator').findAll('a')
total_pages = int(aes[-2].string)
# Get the URL for the additional pages.
# It is the same as the url for this page, but the digit after the
# '_' changes.
#
# We're using the path in the href instead of the full url to avoid
# problems if the mirror URL itself includes another '_'
#
# We also skip the first additional_url, since that would be the
# same as what we already got in url.
additional_url = [self.HOST+aes[0]['href'].split('_')[0]+'_%s.html'% x for x in xrange(total_pages)][1:]
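            # Illustrative example (hypothetical href, not a real mirror path):
            # if aes[0]['href'] were '/cablegate-201011_0.html' and total_pages
            # were 3, additional_url would end up as
            # ['http://wikileaks.ch/cablegate-201011_1.html',
            #  'http://wikileaks.ch/cablegate-201011_2.html'].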
#print 'Additional:',additional_url
articles.extend(self.cables_to_articles(soup.find('table','cable')))
for url in additional_url:
page_soup = self.index_to_soup(url)
articles.extend(self.cables_to_articles(page_soup.find('table','cable')))
feeds.append( (date, articles) )
return feeds
def should_br(self, string, soup):
"""Returns true if 'string' should be on a line of its own when appended into 'soup'"""
        if underline_re.match(string) is not None:
            print string, "should br because it's a separator."
            return True
        if len(string) < self.BREAKLINE_LENGTH:
            if len(soup.contents) > 0 and isinstance(soup.contents[-1], NavigableString) and len(soup.contents[-1].string) > 0:
                s = soup.contents[-1].string.rstrip()
                if s.endswith(u'.'):
                    print string, "should br because it's less than", self.BREAKLINE_LENGTH, "and the last string ended with a dot."
                    return True
                if string.isupper() and not s.isupper():
                    # Sudden change of case; it might be the page count, which
                    # sometimes appears in the middle of the cable, or a title.
                    print string, "should br because it's less than", self.BREAKLINE_LENGTH, "and it's all uppercase while the last string wasn't ('%s')" % s
                    return True
                return False
#print string, "should br because soup is empty or last instance wasn't a string or it was an empty string"
#print "what was it? len:", len(soup.contents)
#if len(soup.contents) > 0:
# print "what was it? type:", type(soup.contents[-1])
# if isinstance(soup.contents[-1], NavigableString):
# print "what was it? string-len:",len(soup.contents[-1].string),"string:",soup.contents[-1].string
#return True
return False
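    # A sketch of how the heuristic above plays out (hypothetical input):
    #   last appended string: u'REF: STATE 12345.'  (ends with a dot)
    #   candidate line:       u'SUMMARY'            (short, all uppercase)
    # -> should_br returns True, so 'SUMMARY' is placed on its own line.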
def starts_nl(self, soup):
"""Returns true if the next appended soup element would start in a new line"""
if len(soup.contents) == 0:
            # We'll assume that this is always the case; it is not for certain
            # inline tags ('span', 'i', 'b' and so on).
return True
if isinstance(soup.contents[-1], Tag) and (soup.contents[-1].name == 'p' or soup.contents[-1].name == 'br'):
return True
return False
def preprocess_split_paragraphs(self, tag, soup, alwaysbr=False):
newsoup = Tag(soup, 'div')
        # reached_pars is set to True once we find an <a href=... id=...> tag,
        # which marks the start and end of the numbered paragraphs.
reached_pars = False
was_str = False
for elem in tag:
if isinstance(elem, NavigableString):
s = unicode(elem)
                for s in (t.strip() for t in s.split(u'\n\n')):
#print 'S:',s
p = None
# This fixes a problem caused by the way .replaceWith works
# with navigable strings. This was causing spurious
# paragraphs to be inserted in the tags.
if not was_str:
p = Tag(soup, 'p')
else:
p = newsoup.contents[-1]
                    if s.find(u'\n') == -1:
# Add a space to compensate for the strip() (which
# might be merging words together).
p.append(s+u' ')
else:
                        for l in (t.strip() for t in s.split(u'\n')):
if len(l) == 0:
# Avoid useless 'br's
continue
if alwaysbr or not reached_pars:
# Always insert breaklines. Used for headers.
p.append(l)
p.append(Tag(soup, 'br'))
elif self.should_br(l, p):
# Try to guess if a breakline should be
# included. Used for headers within the cable
# and subtitles.
if not self.starts_nl(p):
p.append(Tag(soup, 'br'))
p.append(l)
p.append(Tag(soup, 'br'))
else:
# No breakline required, probably within a
# paragraph.
                                # Add a space to compensate for the strip() (which
                                # might be merging words together).
p.append(l+u' ')
if was_str:
was_str = False
elif not self.tag_to_string(p).strip() == u'':
newsoup.append(p)
was_str = True
            else:  # Assuming it's a Tag; I don't think it can be anything else.
if elem.name == 'a' and elem.has_key('id'):
reached_pars = True
was_str = False
return newsoup
def populate_article_metadata(self, article, soup, first):
title = article.title
if title:
div = soup.find('div','main')
h3 = Tag(soup,'h3')
h3.append(title)
div.insert(0, h3)
def preprocess_html(self, soup):
head = soup.find('table','cable')
headers = head.tr
headers.extract()
headers_values = head.tr
headers_values.extract()
div = Tag(soup, 'div')
for i in xrange(len(headers)):
if self.tag_to_string(headers.contents[i]).strip() == '':
continue
h = Tag(soup, 'div')
            hh = Tag(soup, 'span')
hh['style'] = 'font-weight: bold;'
hh.insert(0, NavigableString(self.tag_to_string(headers.contents[i]).strip()+u':'))
            hv = Tag(soup, 'span')
hv['style'] = 'text-align: right;'
hv.insert(0, NavigableString(self.tag_to_string(headers_values.contents[i])))
h.insert(0, hh)
h.insert(1, hv)
div.insert(i, h)
head.replaceWith(div)
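        # Roughly, assuming header markup like the following (illustrative,
        # not copied verbatim from the site):
        #   <table class="cable">
        #     <tr><td>Reference ID</td><td>Created</td>...</tr>
        #     <tr><td>10STATE123</td><td>2010-01-01</td>...</tr>
        #   </table>
        # the code above turns it into a div of
        # '<span>Reference ID:</span><span>10STATE123</span>' rows.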
# Get rid of the links in the cable tags
for alink in soup.findAll(lambda tag: tag.name == 'a' and not tag.has_key('id')):
tstr = alink.string
alink.replaceWith(tstr)
        # Try to get usable linebreaks and paragraphs out of the
        # barely-formatted text.
        # The main problem is that keeping the exact same linebreaks doesn't
        # make sense on an ebook reader: some lines will look odd, and the
        # whole point of being able to change the font size is that the text
        # will reflow correctly (which won't happen if we force the original
        # linebreaks).
start = 1
pres = soup.findAll('pre')
pres[0].replaceWith(self.preprocess_split_paragraphs(pres[0], soup, True))
if len(pres) > 2:
            # Some cables have only 2 'pre' tags; in those, the second one
            # contains both the second header AND the content, and we don't
            # want to mess up the linebreaks there (hence the check above).
pres[1].replaceWith(self.preprocess_split_paragraphs(pres[1], soup, True))
start = 2
for pre in pres[start:]:
pre.replaceWith(self.preprocess_split_paragraphs(pre, soup))
#print soup.prettify()
return soup
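
# To try this recipe locally, one would typically use calibre's standard
# command-line converter (file names illustrative); --test limits the
# download to a couple of articles per feed:
#
#   ebook-convert cablegate.recipe cablegate.epub --test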