-
Notifications
You must be signed in to change notification settings - Fork 7
/
feedparser.py
129 lines (109 loc) · 3.52 KB
/
feedparser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#! /usr/bin/env python3
# -*- coding: utf-8; py-indent-offset: 4 -*-
#
# Author: Linuxfabrik GmbH, Zurich, Switzerland
# Contact: info (at) linuxfabrik (dot) ch
# https://www.linuxfabrik.ch/
# License: The Unlicense, see LICENSE file.
# https://github.com/Linuxfabrik/monitoring-plugins/blob/main/CONTRIBUTING.rst
"""Parse Atom and RSS feeds in Python.
Time zone handling is not implemented.
"""
__author__ = 'Linuxfabrik GmbH, Zurich/Switzerland'
__version__ = '2024041001'
import sys
from .globals import STATE_UNKNOWN
try:
from bs4 import BeautifulSoup
except ImportError as e:
print('Python module "BeautifulSoup4" is not installed.')
sys.exit(STATE_UNKNOWN)
from . import time
from . import url
def parse_atom(soup):
    """Convert a BeautifulSoup-parsed Atom document into a plain dict.

    Returns a dict with 'title', 'updated', 'updated_parsed' (a datetime;
    the time zone part of the timestamp is cut off and ignored) and
    'entries': a list of dicts, one per <entry>, each with 'title', 'id',
    'updated', 'updated_parsed' and - when a <summary> or <content>
    element is usable - a plain-text 'summary'.
    """
    result = {}
    result['title'] = soup.title.string
    result['updated'] = soup.updated.string
    # cut the timezone part, for example from "2024-04-10T06:12:00+02:00"
    result['updated_parsed'] = time.timestr2datetime(
        result['updated'][0:19],
        pattern='%Y-%m-%dT%H:%M:%S',
    )
    result['entries'] = []
    for entry in soup.find_all('entry'):
        tmp = {}
        tmp['title'] = entry.title.string
        tmp['id'] = entry.id.string
        tmp['updated'] = entry.updated.string
        # cut the timezone part
        tmp['updated_parsed'] = time.timestr2datetime(
            tmp['updated'][0:19],
            pattern='%Y-%m-%dT%H:%M:%S',
        )
        # Prefer <summary>, fall back to <content>; strip embedded HTML.
        # Use a local variable for the inner parse - the original code
        # rebound `soup`, shadowing the feed document passed in.
        for tag_name in ('summary', 'content'):
            node = getattr(entry, tag_name)
            if node is None or node.string is None:
                continue
            try:
                tmp['summary'] = BeautifulSoup(node.string, 'lxml').get_text()
                break
            except Exception:
                # best-effort: a broken summary/content must not abort
                # parsing of the whole feed
                continue
        result['entries'].append(tmp)
    return result
def parse_rss(soup):
    """Convert a BeautifulSoup-parsed RSS document into a plain dict.

    Returns a dict with 'title', 'updated' (from <pubDate>, falling back
    to <lastBuildDate>), 'updated_parsed' (a datetime; the time zone part
    is cut off and ignored) and 'entries': a list of dicts, one per
    <item>, each with 'title', 'id', 'updated', 'updated_parsed' and -
    when a <description> is usable - a plain-text 'summary'.
    If the channel has neither timestamp, only 'title' is returned.
    """
    result = {}
    result['title'] = soup.rss.channel.title.string
    # missing tags yield None, so `.string` raises AttributeError
    try:
        result['updated'] = soup.rss.channel.pubDate.string
    except AttributeError:
        try:
            result['updated'] = soup.rss.channel.lastBuildDate.string
        except AttributeError:
            # no channel-level timestamp at all; keep the original
            # early-return behavior (no 'entries' key)
            return result
    # cut the timezone part from "Wed, 10 Apr 2024 06:12:00 Z"
    result['updated_parsed'] = time.timestr2datetime(
        result['updated'][0:25],
        pattern='%a, %d %b %Y %H:%M:%S',
    )
    result['entries'] = []
    for entry in soup.find_all('item'):
        tmp = {}
        tmp['title'] = entry.title.string
        tmp['id'] = entry.guid.string
        tmp['updated'] = entry.pubDate.string
        # cut the timezone part
        tmp['updated_parsed'] = time.timestr2datetime(
            tmp['updated'][0:25],
            pattern='%a, %d %b %Y %H:%M:%S',
        )
        # Strip embedded HTML from the description, best-effort.
        # Use a local variable for the inner parse - the original code
        # rebound `soup`, shadowing the feed document passed in.
        try:
            tmp['summary'] = BeautifulSoup(entry.description.string, 'lxml').get_text()
        except Exception:
            # a missing or broken <description> must not abort parsing
            pass
        result['entries'].append(tmp)
    return result
def parse(feed_url, insecure=False, no_proxy=False, timeout=5, encoding='urlencode'):
    """Parse a feed from a URL, file, stream, or string.

    Fetches the document, parses it as XML and dispatches to the Atom or
    RSS handler. Returns a (success, payload) tuple: on success the
    payload is the parsed feed dict, otherwise an error message or the
    exception raised while parsing.
    """
    fetched, payload = url.fetch(
        feed_url,
        encoding=encoding,
        insecure=insecure,
        no_proxy=no_proxy,
        timeout=timeout,
    )
    if not fetched:
        return (False, payload)
    try:
        soup = BeautifulSoup(payload, 'xml')
    except Exception as e:
        return (False, e)
    # Atom feeds carry a top-level <feed> element.
    if soup.feed is not None:
        return (True, parse_atom(soup))
    # RSS feeds carry a top-level <rss> element.
    if soup.rss is not None:
        return (True, parse_rss(soup))
    return (False, '{} does not seem to be an Atom or RSS feed I understand.'.format(feed_url))