cablegate.recipe
#!/usr/bin/python
# vim:fileencoding=utf-8
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
import re
__license__ = 'WTFPL'
# Matches lines made up solely of '-' characters and whitespace
# (i.e. the separator/underline lines in the cables).
underline_re = re.compile(ur'^-[-\s]+$')
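# Illustrative examples (sample strings are mine, not taken from real cables):
#   underline_re.match(u'----')     matches  (separator line)
#   underline_re.match(u'- - - -')  matches  (dashes and whitespace only)
#   underline_re.match(u'-- note')  is None  (contains other characters)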
class WikileaksCablegate(BasicNewsRecipe):
title = u'Wikileaks: Cablegate'
publisher = u'Wikileaks'
category = u'Leak, Foreign Policy, USA'
    description = u"Today's Cablegate leaks from WikiLeaks."
language = 'en'
remove_tags_before = dict(name='table', attrs={'class':'cable'})
    # Note: I found that sometimes the <script> at the end won't get removed...
remove_tags = [dict(name='link'), dict(name='code'), dict(name='script')]
remove_javascript = True
max_articles_per_feed = 1000
# Pick a host from the list below. Only one!
# (To choose, just remove the '#' in front and make sure that all the other
# HOSTs have a '#' in front of them.)
#
#HOST = 'http://cablegate.r3blog.nl'
#HOST = 'http://wl.opsec.eu'
#HOST = 'http://87.106.58.253'
#HOST = 'http://cablegate.askedo.de'
HOST = 'http://wikileaks.ch'
# How many days of leaks to download.
#
DAYS = 2
    # Minimum number of characters a line must have before it stops being
    # considered a header or subtitle.
#
BREAKLINE_LENGTH = 50
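    # For instance (values illustrative): a short all-caps line such as
    # 'SUMMARY' would be treated as a header candidate, while a full
    # 70-character sentence would be considered body text.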
def get_masthead_url(self):
return self.HOST + '/static/gfx/WL_Hour_Glass_small.png'
def get_browser(self):
br = BasicNewsRecipe.get_browser()
        # Some mirrors seem to always send gzipped content, even when the
        # HTTP headers don't indicate it is supported. Thankfully this makes
        # that transparent.
br.set_handle_gzip(True)
return br
def cables_to_articles(self, cablessoup):
articles = []
for cable in cablessoup.findAll('tr'):
if cable.find('th') is not None:
continue
entries = cable.findAll('td')
cable_url = self.HOST + entries[0].a['href']
            cable_title = self.tag_to_string(entries[1])
cable_date = self.tag_to_string(entries[2])
cable_desc = self.tag_to_string(entries[4]) + ' ' + self.tag_to_string(entries[5])
articles.append({'title' : cable_title.strip(),
'date' : cable_date.strip(),
'url' : cable_url.strip(),
'description' : cable_desc.strip()})
#print "Article:",cable_title,cable_date,cable_url
return articles
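    # The dicts built above follow calibre's standard parse_index() article
    # format ('title', 'url', 'date' and 'description' keys).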
def parse_index(self):
cablegate = self.index_to_soup(self.HOST + '/cablegate.html')
cable_dates = []
cable_urls = []
# Get the last DAYS
for h3 in cablegate.findAll('h3'):
if h3.string and h3.string == 'Browse latest releases':
#look for divs
sibling = h3.nextSibling
aes = []
                while sibling is not None and (isinstance(sibling, NavigableString) or sibling.name != 'h3'):
if isinstance(sibling, Tag) and sibling.name == 'div' and sibling['class'] == 'sort':
aes.extend(sibling.findAll('a'))
sibling = sibling.nextSibling
aes.reverse()
#print aes
for i in xrange(min(self.DAYS, len(aes))):
cable_dates.append(aes[i].string)
cable_urls.append(self.HOST + aes[i]['href'])
break
feeds = []
for url,date in zip(cable_urls, cable_dates):
articles = []
print url
soup = self.index_to_soup(url)
aes = soup.find('div','paginator').findAll('a')
total_pages = int(aes[-2].string)
# Get the URL for the additional pages.
# It is the same as the url for this page, but the digit after the
# '_' changes.
#
# We're using the path in the href instead of the full url to avoid
# problems if the mirror URL itself includes another '_'
#
# We also skip the first additional_url, since that would be the
# same as what we already got in url.
additional_url = [self.HOST+aes[0]['href'].split('_')[0]+'_%s.html'% x for x in xrange(total_pages)][1:]
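            # Illustrative example (hypothetical href, not a real mirror path):
            # if aes[0]['href'] were '/cablegate-201011_0.html' and total_pages
            # were 3, additional_url would end up as
            # ['http://wikileaks.ch/cablegate-201011_1.html',
            #  'http://wikileaks.ch/cablegate-201011_2.html'].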
#print 'Additional:',additional_url
articles.extend(self.cables_to_articles(soup.find('table','cable')))
for url in additional_url:
page_soup = self.index_to_soup(url)
articles.extend(self.cables_to_articles(page_soup.find('table','cable')))
feeds.append( (date, articles) )
return feeds
def should_br(self, string, soup):
"""Returns true if 'string' should be on a line of its own when appended into 'soup'"""
        if underline_re.match(string) is not None:
            print string, "should br because it's a separator."
            return True
        if len(string) < self.BREAKLINE_LENGTH:
            if len(soup.contents) > 0 and isinstance(soup.contents[-1], NavigableString) and len(soup.contents[-1].string) > 0:
                s = soup.contents[-1].string.rstrip()
                if s.endswith(u'.'):
                    print string, "should br because it's less than", self.BREAKLINE_LENGTH, "and the last string ended with a dot."
                    return True
                if string.isupper() and not s.isupper():
                    # Sudden change of case; it might be the page count, which
                    # sometimes appears in the middle of the cable, or a title.
                    print string, "should br because it's less than", self.BREAKLINE_LENGTH, "and it's all uppercase while the last string wasn't ('%s')" % s
                    return True
                return False
#print string, "should br because soup is empty or last instance wasn't a string or it was an empty string"
#print "what was it? len:", len(soup.contents)
#if len(soup.contents) > 0:
# print "what was it? type:", type(soup.contents[-1])
# if isinstance(soup.contents[-1], NavigableString):
# print "what was it? string-len:",len(soup.contents[-1].string),"string:",soup.contents[-1].string
#return True
return False
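    # A sketch of how the heuristic above plays out (hypothetical input):
    #   last appended string: u'REF: STATE 12345.'  (ends with a dot)
    #   candidate line:       u'SUMMARY'            (short, all uppercase)
    # -> should_br returns True, so 'SUMMARY' is placed on its own line.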
def starts_nl(self, soup):
"""Returns true if the next appended soup element would start in a new line"""
if len(soup.contents) == 0:
            # We'll assume that this is always the case; it is not for certain
            # inline tags ('span', 'i', 'b' and so on).
return True
if isinstance(soup.contents[-1], Tag) and (soup.contents[-1].name == 'p' or soup.contents[-1].name == 'br'):
return True
return False
def preprocess_split_paragraphs(self, tag, soup, alwaysbr=False):
newsoup = Tag(soup, 'div')
        # reached_pars is set to True once we find an <a href=... id=...> tag,
        # which marks the start and end of the numbered paragraphs.
reached_pars = False
was_str = False
for elem in tag:
if isinstance(elem, NavigableString):
s = unicode(elem)
                for s in (t.strip() for t in s.split(u'\n\n')):
#print 'S:',s
p = None
# This fixes a problem caused by the way .replaceWith works
# with navigable strings. This was causing spurious
# paragraphs to be inserted in the tags.
if not was_str:
p = Tag(soup, 'p')
else:
p = newsoup.contents[-1]
                    if s.find(u'\n') == -1:
# Add a space to compensate for the strip() (which
# might be merging words together).
p.append(s+u' ')
else:
                        for l in (t.strip() for t in s.split(u'\n')):
if len(l) == 0:
# Avoid useless 'br's
continue
if alwaysbr or not reached_pars:
# Always insert breaklines. Used for headers.
p.append(l)
p.append(Tag(soup, 'br'))
elif self.should_br(l, p):
# Try to guess if a breakline should be
# included. Used for headers within the cable
# and subtitles.
if not self.starts_nl(p):
p.append(Tag(soup, 'br'))
p.append(l)
p.append(Tag(soup, 'br'))
else:
# No breakline required, probably within a
# paragraph.
                                # Add a space to compensate for the strip() (which
                                # might be merging words together).
p.append(l+u' ')
if was_str:
was_str = False
elif not self.tag_to_string(p).strip() == u'':
newsoup.append(p)
was_str = True
            else:  # Assuming it's a Tag; I don't think it can be anything else.
if elem.name == 'a' and elem.has_key('id'):
reached_pars = True
was_str = False
return newsoup
def populate_article_metadata(self, article, soup, first):
title = article.title
if title:
div = soup.find('div','main')
h3 = Tag(soup,'h3')
h3.append(title)
div.insert(0, h3)
def preprocess_html(self, soup):
head = soup.find('table','cable')
headers = head.tr
headers.extract()
headers_values = head.tr
headers_values.extract()
div = Tag(soup, 'div')
for i in xrange(len(headers)):
if self.tag_to_string(headers.contents[i]).strip() == '':
continue
h = Tag(soup, 'div')
            hh = Tag(soup, 'span')
hh['style'] = 'font-weight: bold;'
hh.insert(0, NavigableString(self.tag_to_string(headers.contents[i]).strip()+u':'))
            hv = Tag(soup, 'span')
hv['style'] = 'text-align: right;'
hv.insert(0, NavigableString(self.tag_to_string(headers_values.contents[i])))
h.insert(0, hh)
h.insert(1, hv)
div.insert(i, h)
head.replaceWith(div)
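        # Roughly, assuming header markup like the following (illustrative,
        # not copied verbatim from the site):
        #   <table class="cable">
        #     <tr><td>Reference ID</td><td>Created</td>...</tr>
        #     <tr><td>10STATE123</td><td>2010-01-01</td>...</tr>
        #   </table>
        # the code above turns it into a div of
        # '<span>Reference ID:</span><span>10STATE123</span>' rows.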
# Get rid of the links in the cable tags
for alink in soup.findAll(lambda tag: tag.name == 'a' and not tag.has_key('id')):
tstr = alink.string
alink.replaceWith(tstr)
        # Try to get usable linebreaks and paragraphs out of the
        # barely-formatted text.
        # The main problem is that keeping the exact same linebreaks doesn't
        # make sense on an ebook reader: some lines will look odd, and the
        # whole point of being able to change the font size is that the text
        # will reflow correctly (which won't happen if we force the original
        # linebreaks).
start = 1
pres = soup.findAll('pre')
pres[0].replaceWith(self.preprocess_split_paragraphs(pres[0], soup, True))
if len(pres) > 2:
            # Some cables have only 2 'pre' tags; in those, the second one
            # contains both the second header AND the content, and we don't
            # want to mess up the linebreaks there (hence the check above).
pres[1].replaceWith(self.preprocess_split_paragraphs(pres[1], soup, True))
start = 2
for pre in pres[start:]:
pre.replaceWith(self.preprocess_split_paragraphs(pre, soup))
#print soup.prettify()
return soup
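
# To try this recipe locally, one would typically use calibre's standard
# command-line converter (file names illustrative); --test limits the
# download to a couple of articles per feed:
#
#   ebook-convert cablegate.recipe cablegate.epub --test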