-
-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.py
executable file
·188 lines (165 loc) · 9.93 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import urllib.parse
import requests
import sqlite3
import time
import json
import sys
import re
import os
class DB:
    """Shared SQLite connection and cursor used by the rest of the script.

    The connection is opened when this class body executes, against a
    ``db.sqlite3`` file located in the same directory as this source file.
    """

    # Directory that contains this source file.
    path = os.path.dirname(os.path.realpath(__file__))
    con = sqlite3.connect(os.path.join(path, 'db.sqlite3'))
    cur = con.cursor()

    @staticmethod
    def update():
        """Create the ``people`` and ``family`` tables if missing and commit.

        Both tables declare a UNIQUE constraint with ``ON CONFLICT REPLACE``
        on their key column, so inserting an existing key replaces the row.
        """
        # String defaults use single quotes: double-quoted string literals
        # are a legacy SQLite quirk that some builds disable.
        DB.cur.execute(
            "CREATE TABLE IF NOT EXISTS people ("
            "firstname TEXT NOT NULL DEFAULT '', "
            "lastname TEXT NOT NULL DEFAULT '', "
            "sex TEXT, "
            "birthdate DATE, "
            "birthplace TEXT, "
            "deathdate DATE, "
            "deathplace TEXT, "
            "permalink TEXT PRIMARY KEY, "
            "family_id INT, "
            "CONSTRAINT `unique_permalink` UNIQUE(permalink) ON CONFLICT REPLACE)"
        )
        DB.cur.execute(
            "CREATE TABLE IF NOT EXISTS family ("
            "id TEXT PRIMARY KEY, "
            "father_permalink TEXT, "
            "mother_permalink TEXT, "
            "wedding_date DATE, "
            "wedding_place TEXT, "
            "CONSTRAINT `unique_id` UNIQUE(id) ON CONFLICT REPLACE)"
        )
        DB.con.commit()
class Family:
    """A couple, identified by the permalinks of the father and the mother.

    Every instance registers itself in ``Family.instances`` under its id, so
    the same couple can be looked up again with ``Family.get``.
    """

    # Registry of all families created so far, keyed by family id.
    instances = {}

    def __init__(self, father_permalink, mother_permalink):
        """Create the family and register it (replacing any same-id entry)."""
        self.id = Family._make_id(father_permalink, mother_permalink)
        self.father_permalink = father_permalink
        self.mother_permalink = mother_permalink
        self.wedding_date = self.wedding_place = ''
        Family.instances[self.id] = self

    @staticmethod
    def _make_id(father_permalink, mother_permalink):
        """Build the family id: both permalinks joined by ``#``."""
        return father_permalink + '#' + mother_permalink

    @staticmethod
    def get(father_permalink, mother_permalink):
        """Return the registered family for this couple, creating it if needed."""
        family_id = Family._make_id(father_permalink, mother_permalink)
        try:
            return Family.instances[family_id]
        except KeyError:
            return Family(father_permalink, mother_permalink)

    def save(self):
        """Insert this family into the ``family`` table (no commit is issued)."""
        DB.cur.execute(
            'INSERT INTO family (id, father_permalink, mother_permalink, wedding_date, wedding_place) VALUES (?, ?, ?, ?, ?)',
            (self.id, self.father_permalink, self.mother_permalink, self.wedding_date, self.wedding_place),
        )
class People:
    """One person record: identity, vital events, and the parents' family id."""

    def __init__(self, permalink, firstname='', lastname='', sex='', birthdate='', birthplace='', deathdate='', deathplace='', family_id=''):
        """Store the given fields as attributes; everything but *permalink* defaults to ''."""
        self.permalink = permalink
        self.firstname = firstname
        self.lastname = lastname
        self.sex = sex
        self.birthdate = birthdate
        self.birthplace = birthplace
        self.deathdate = deathdate
        self.deathplace = deathplace
        self.family_id = family_id

    def __str__(self):
        """Return the main fields as one space-separated line."""
        fields = (
            self.sex, self.firstname, self.lastname, self.permalink,
            self.birthdate, self.deathdate, self.birthplace, self.deathplace,
        )
        return ' '.join(map(str, fields))

    def save(self, DB):
        """Insert this person into the ``people`` table via ``DB.cur``.

        ``family_id`` is written too, so saving a person that already has a
        family does not drop the link; an empty ``family_id`` is stored as
        NULL. No commit is issued here.
        """
        DB.cur.execute(
            'INSERT INTO people (firstname, lastname, sex, birthdate, birthplace, deathdate, deathplace, permalink, family_id) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)',
            (
                self.firstname, self.lastname, self.sex, self.birthdate,
                self.birthplace, self.deathdate, self.deathplace,
                self.permalink, self.family_id or None,
            ),
        )
class Process:
    """Crawl person pages under ``Process.base`` and store them in the database.

    ``cache`` maps the query string a page was fetched with to the ``People``
    object scraped from it, so a page that was already visited is not fetched
    again.
    """

    base = 'http://roglo.eu/roglo?'
    # Seconds to wait for the HTTP response instead of blocking indefinitely.
    timeout = 30
    # init_caches() ignores a cache file older than this many seconds.
    cache_max_age = 12 * 3600

    def __init__(self, filename):
        """Remember the cache file name and start with an empty cache."""
        self.filename = filename
        self.cache = {}

    def init_caches(self):
        """Rebuild ``self.cache`` from the JSON cache file and the database.

        Nothing is loaded when the in-memory cache is already populated, or
        when the file is missing, empty, or older than ``cache_max_age``.
        The file maps page queries to permalinks; every entry whose permalink
        has a row in ``people`` is restored as a ``People`` object.
        """
        if self.cache or not os.path.isfile(self.filename):
            return
        if os.path.getmtime(self.filename) <= time.time() - Process.cache_max_age:
            return
        if os.path.getsize(self.filename) <= 0:
            return
        with open(self.filename, 'r', encoding='utf-8') as f:
            stored = json.load(f)
        DB.cur.execute('SELECT firstname, lastname, sex, birthdate, birthplace, deathdate, deathplace, permalink, family_id FROM people')
        # Index the rows by permalink once instead of rescanning the cache
        # file for every database row.
        by_permalink = {}
        for (firstname, lastname, sex, birthdate, birthplace, deathdate, deathplace, permalink, family_id) in DB.cur.fetchall():
            by_permalink[permalink] = People(permalink, firstname, lastname, sex, birthdate, birthplace, deathdate, deathplace, family_id)
        for query, permalink in stored.items():
            person = by_permalink.get(permalink) if isinstance(permalink, str) else None
            if person is not None:
                self.cache[query] = person

    def save_caches(self):
        """Write the cache to ``self.filename`` as a JSON query -> permalink map.

        The mapping is built before the file is opened and ``self.cache`` is
        left untouched, so the in-memory cache stays usable afterwards.
        """
        snapshot = {query: person.permalink for query, person in self.cache.items()}
        with open(self.filename, 'w', encoding='utf-8') as f:
            json.dump(snapshot, f)

    @staticmethod
    def extractParams(href):
        """Parse the ``;``-separated ``key=value`` pairs of *href*'s query.

        Fragments without ``=`` (for example the empty one left by a trailing
        ``;``) are skipped. Returns a dict of strings.
        """
        params = {}
        for fragment in Process.extractQuery(href).split(';'):
            key, separator, value = fragment.partition('=')
            if separator:
                params[key] = value
        return params

    @staticmethod
    def extractQuery(href):
        """Return the text following the first ``?`` in *href*, or '' if none."""
        parts = href.split('?')
        return parts[1] if len(parts) > 1 else ''

    @staticmethod
    def dictToDate(d):
        """Format the ``yg``/``mg``/``dg`` entries of *d* as a date string.

        Returns ``YYYY-MM-DD`` (month and day zero-padded) when all three keys
        are present, the bare ``yg`` value when only it is usable, else ''.
        """
        if 'yg' not in d:
            return ''
        if 'mg' in d and 'dg' in d:
            return d['yg'] + '-' + d['mg'].zfill(2) + '-' + d['dg'].zfill(2)
        return d['yg']

    @staticmethod
    def _quoted_text(tag):
        """Return the first double-quoted substring of *tag*'s text, or ''."""
        chunks = tag.text.strip().split('"')
        return chunks[1] if len(chunks) > 1 else ''

    @staticmethod
    def _link_query(tag):
        """Return the query string of the first ``<a>`` inside *tag*, or ''."""
        anchor = tag.find('a')
        href = anchor.get('href', '') if anchor is not None else ''
        return Process.extractQuery(href.strip())

    @staticmethod
    def _parse_person(html):
        """Build a ``People`` from the part of *html* that precedes the first ``<h3``."""
        soup = BeautifulSoup(html.split('<h3')[0], "html.parser")

        inputs = soup.select('h1 input')
        raw_permalink = inputs[0]['value'].strip() if inputs else ''
        parts = raw_permalink.replace('[', '').replace(']', '').split('/')
        if len(parts) > 2:
            permalink = 'p=%s;n=%s;' % (parts[0], parts[1])
            # The third component is only emitted when it is not '0'.
            if parts[2] != '0':
                permalink += 'oc=%s' % (parts[2],)
        else:
            permalink = ''
        people = People(permalink)

        images = soup.select('h1 img')
        people.sex = images[0]['alt'].strip() if images else ''
        names = soup.select('h1 a')
        people.firstname = names[0].text.strip() if len(names) > 0 else ''
        people.lastname = names[1].text.strip() if len(names) > 1 else ''

        # The first date link is taken as the birth, the second as the death;
        # places are read the same way from the <script> elements.
        dates = soup.select('ul li a.date')
        birth = Process.extractParams(dates[0]['href'].strip()) if len(dates) > 0 else {}
        death = Process.extractParams(dates[1]['href'].strip()) if len(dates) > 1 else {}
        people.birthdate = Process.dictToDate(birth)
        people.deathdate = Process.dictToDate(death)
        places = soup.select('ul li script')
        people.birthplace = Process._quoted_text(places[0]) if len(places) > 0 else ''
        people.deathplace = Process._quoted_text(places[1]) if len(places) > 1 else ''
        return people

    def _resolve(self, query):
        """Return the person for *query* from the cache, else by browsing it.

        Returns None when *query* is empty.
        """
        if not query:
            return None
        if query in self.cache:
            return self.cache[query]
        return self.browse(query)

    def _link_parents(self, soup, people):
        """Follow the 'Parents' section and attach *people* to that family."""
        heading = soup.find('h3', text='Parents')
        if not heading:
            return
        ul = heading.find_next('ul')
        items = ul.find_all('li') if ul is not None else []
        father = self._resolve(Process._link_query(items[0])) if len(items) > 0 else None
        mother = self._resolve(Process._link_query(items[1])) if len(items) > 1 else None
        # A missing parent contributes an empty permalink rather than leaving
        # the name undefined.
        father_permalink = father.permalink if father else ''
        mother_permalink = mother.permalink if mother else ''
        if father_permalink or mother_permalink:
            family = Family.get(father_permalink, mother_permalink)
            family.save()
            DB.cur.execute('UPDATE people SET family_id = ? WHERE permalink = ?', (family.id, people.permalink))
            people.family_id = family.id

    def _link_spouse(self, soup, people):
        """Follow the first spouse link and record the couple's wedding."""
        heading = soup.find('h3', text='Spouses and children')
        if not heading:
            return
        ul = heading.find_next('ul')
        if ul is None:
            return
        bolds = ul.find_all('b')
        spouse = self._resolve(Process._link_query(bolds[0])) if bolds else None
        if spouse is None:
            # No usable spouse link: there is no couple to record.
            return
        dates = ul.select('li a.date')
        wedding = Process.extractParams(dates[0]['href'].strip()) if dates else {}
        wedding_date = Process.dictToDate(wedding)
        places = ul.select('li script')
        wedding_place = Process._quoted_text(places[0]) if places else ''
        if people.sex == 'M':
            father_permalink, mother_permalink = people.permalink, spouse.permalink
        else:
            father_permalink, mother_permalink = spouse.permalink, people.permalink
        family = Family.get(father_permalink, mother_permalink)
        family.wedding_date = wedding_date
        family.wedding_place = wedding_place
        family.save()
        print('W %s %s %s' % (wedding_date, wedding_place, family.id))

    def browse(self, path):
        """Fetch ``Process.base + path``, store the person, then follow relatives.

        The person is saved and cached under *path* before the parents and the
        first spouse are visited, so pages that link back to this one are
        served from the cache instead of being fetched again.

        Returns the ``People`` object scraped from the page.
        """
        response = requests.get(Process.base + path, timeout=Process.timeout)
        people = Process._parse_person(response.text)
        print(people)
        people.save(DB)
        self.cache[path] = people
        DB.con.commit()

        soup = BeautifulSoup(response.text, "html.parser")
        self._link_parents(soup, people)
        self._link_spouse(soup, people)
        return people

    @staticmethod
    def _csv_quoted(value):
        """Wrap *value* in double quotes, doubling any embedded quote; None becomes ""."""
        text = '' if value is None else str(value)
        return '"%s"' % text.replace('"', '""')

    def export(self, filename):
        """Dump the database to *filename* as three CSV sections.

        The sections are persons, marriages, and family/child links, each
        introduced by its own header line and separated by blank lines.
        """
        quoted = Process._csv_quoted
        with open(filename, 'w', encoding='utf-8') as f:
            DB.cur.execute('SELECT firstname, lastname, sex, birthdate, birthplace, deathdate, deathplace, permalink, family_id FROM people')
            people = DB.cur.fetchall()
            f.write('person,grampsid,firstname,lastname,gender,note,birthdate,birthplace,deathdate,deathplace\n')
            for (firstname, lastname, sex, birthdate, birthplace, deathdate, deathplace, permalink, family_id) in people:
                gender = 'male' if sex == 'M' else 'female' if sex == 'F' else ''
                # The page URL is written in the 'note' column.
                source = Process.base + permalink
                f.write('%s,,%s,%s,%s,%s,%s,%s,%s,%s\n' % (permalink, quoted(firstname), quoted(lastname), gender, source, birthdate, quoted(birthplace), deathdate, quoted(deathplace)))

            # NOTE(review): the header names six columns but each row carries
            # five (no 'source' value) - confirm what the importer expects.
            f.write('\n\nmarriage,husband,wife,date,place,source\n')
            DB.cur.execute('SELECT id, father_permalink, mother_permalink, wedding_date, wedding_place FROM family')
            for (family_id, father_permalink, mother_permalink, wedding_date, wedding_place) in DB.cur.fetchall():
                f.write('%s,%s,%s,%s,%s\n' % (family_id, father_permalink, mother_permalink, wedding_date or '', quoted(wedding_place or '')))

            f.write('\n\nfamily,child\n')
            for row in people:
                permalink, family_id = row[7], row[8]
                f.write('%s,%s\n' % (family_id or '', permalink))
def main(argv=None):
    """Crawl the page named on the command line and export the database.

    *argv* defaults to ``sys.argv``; its second item is the page URL (or the
    query part that follows ``Process.base``). The URL is checked before any
    work is done.

    Returns 0 on success, or 2 when no URL was given.
    """
    argv = sys.argv if argv is None else argv
    url = argv[1] if len(argv) > 1 else ''
    if not url:
        # Usage errors go to stderr with a non-zero status so callers can
        # detect the failure.
        print('Please provide a URL', file=sys.stderr)
        return 2

    DB.update()
    process = Process('cache.json')
    process.init_caches()
    # Accept a full URL as well as a bare query string.
    process.browse(url.replace(Process.base, ''))
    DB.con.commit()
    process.save_caches()
    process.export('export.csv')
    return 0


if __name__ == '__main__':
    sys.exit(main())