Fix meetings, meetings_calendar crawler error #26

Open · wants to merge 9 commits into master
1 change: 0 additions & 1 deletion meetings/crawl.py
@@ -27,7 +27,6 @@ def checkdir(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)

-
def get_filename(data, filetype):
    if filetype=='json':
        directory = jsondir
Empty file modified meetings/parse.py (mode 100755 → 100644, contents unchanged)
188 changes: 106 additions & 82 deletions meetings_calendar/get.py
@@ -2,7 +2,6 @@
# -*- coding: utf-8 -*-

import os
-import io
import urllib2
import html5lib
import datetime
@@ -11,101 +10,126 @@

base_url = 'http://www.assembly.go.kr/renew10/anc/schedule/assm/assemact/council/council0101/assmSchCal/assemSchCalInfoAjax.do?currentPage=&movePageNum=&rowPerPage=1000&gubun=&agendaid=&committee_id=&board_id=&record_id=&returnPage=&weekday=&today=&calendarMove=&showDt=&meetingday=%s'

-link_url = 'http://www.assembly.go.kr/renew10/anc/schedule/assm/assemact/council/council0101/assmSchCal/assemScanCalDetail.do?gubun=%s&agendaid=%s&committee_id=%s&board_id=%s&record_id=%s'
+assem_url = 'http://www.assembly.go.kr/renew10/anc/schedule/assm/assemact/council/council0101/assmSchCal/assemScanCalDetail.do?gubun={gubun}&agendaid={agenda_id}'
+cmmtt_url = 'http://www.assembly.go.kr/renew10/anc/schedule/assm/assemact/council/council0101/assmSchCal/assemScanCalDetail.do?gubun={gubun}&committee_id={committee_id}&board_id={board_id}&record_id={record_id}'
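Note: the single link_url template is split into assem_url (plenary schedules, keyed by agendaid alone) and cmmtt_url (committee schedules, keyed by committee_id/board_id/record_id). A minimal sketch of how the new str.format templates expand; the IDs below are invented for illustration, real values are scraped from the page:

    # Hypothetical IDs, for illustration only.
    print assem_url.format(gubun='A01', agenda_id='12345')
    print cmmtt_url.format(gubun='B01', committee_id='C1', board_id='BD1', record_id='R1')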

sources_dir = './sources'

header = '"date","time","type","title","session","sitting","committee","url"\n'

xpath_title = '//a[contains(@onclick, "jsDetail")]/text()'
xpath_link_params = '//a[contains(@onclick, "jsDetail")]/@onclick'
-xpath_datetime = '//dd/text()'
-xpath_committee = '//dd/span/text()'
+xpath_datetime_committee = '//dd'


def is_dashed(str):
    if str.count('-') > 0:
        return True
    else:
        return False
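Note: is_dashed reduces to a substring membership test, and the parameter name shadows the built-in str; an equivalent one-liner, offered only as a suggestion:

    def is_dashed(s):
        return '-' in s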


def crawl(url, directory, filename):
    if not os.path.exists(directory):
        os.makedirs(directory)

    r = urllib2.urlopen(url)
-    with open('%s/%s.html' % (directory, filename), 'w+') as f:
+    with open('%s/%s.html' % (directory, filename), 'w') as f:
        f.write(r.read())

def get_webpage(f):
-    page = html5lib.HTMLParser(\
-        tree=html5lib.treebuilders.getTreeBuilder("lxml"),\
-        namespaceHTMLElements=False)
+    page = html5lib.HTMLParser(
+        tree=html5lib.treebuilders.getTreeBuilder("lxml"),
+        namespaceHTMLElements=False)
    p = page.parse(f, encoding='utf-8')
    return p
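Note: get_webpage builds an lxml tree, which is what makes the xpath queries below possible; switching the queries from //dd/text() plus //dd/span/text() to whole //dd elements keeps each row's datetime and committee aligned even when the span is missing. A standalone sketch using html5lib's one-shot parse helper; the sample markup is invented:

    import html5lib

    tree = html5lib.parse('<dl><dd> 2014-03-01 10:00 <span>Committee</span></dd></dl>',
                          treebuilder='lxml', namespaceHTMLElements=False)
    dd = tree.xpath('//dd')[0]
    print dd.text.strip()       # '2014-03-01 10:00'
    print dd.findtext('span')   # 'Committee'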


-def get_link_url(gubun, agendaid, committee_id, board_id, record_id):
-    return (link_url % (gubun, agendaid, committee_id, board_id, record_id))
+def get_link_url(gubun, agenda_id, committee_id, board_id, record_id):
+    if committee_id == '$schedule.committee_id'\
+            and board_id == '$schedule.board_id'\
+            and record_id == '$schedule.record_id':
+        return assem_url.format(gubun=gubun, agenda_id=agenda_id)
+
+    return cmmtt_url.format(gubun=gubun, committee_id=committee_id, board_id=board_id, record_id=record_id)
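Note: the '$schedule.*' comparisons catch rows whose jsDetail(...) onclick still carries what look like unexpanded server-side template placeholders, i.e. plenary entries with no committee record. A usage sketch with invented argument values:

    # Plenary entry: committee/board/record params arrive as unexpanded placeholders.
    print get_link_url('A01', '12345',
                       '$schedule.committee_id', '$schedule.board_id', '$schedule.record_id')
    # -> assem_url filled with gubun=A01, agendaid=12345

    # Committee entry: real IDs route to cmmtt_url.
    print get_link_url('B01', '', 'C1', 'BD1', 'R1')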

def parse_meeting_schedule(filename):
-    date_length = len('0000-00-00') + 1
-
-    session_re = re.compile(u'제(?P<session>[0-9]+)회')
-    sitting_re = re.compile(u'제(?P<sitting>[0-9]+)차')
-
-    with open(filename, 'r') as f:
-        p = get_webpage(f)
-
-        raw_titles = p.xpath(xpath_title)[0:]
-        link_params = p.xpath(xpath_link_params)[0:]
-        datetimes = p.xpath(xpath_datetime)[0:]
-        committes = p.xpath(xpath_committee)[0:]
-
-        datetimes = [datetime for datetime in datetimes if datetime.strip() != '']
-        link_params = [link_param.replace('jsDetail(', '').replace(');return false;', '') for link_param in link_params]
-
-        dates = [datetime[:date_length].strip() for datetime in datetimes]
-        times = [datetime[date_length:].strip() for datetime in datetimes]
-        types = [title[title.find('[')+1:title.find(']')] for title in raw_titles]
-        titles = [title[title.find(']')+2:] for title in raw_titles]
-        sessions = [session_re.findall(title)[0] for title in titles]
-        sittings = [sitting_re.findall(title)[0] for title in titles]
-        links = [eval('get_link_url(%s)' % link_param) for link_param in link_params]
-
-        return zip(dates, times, types, titles, sessions, sittings, committes, links)
-
-def get_meeting_list(start, end):
-    if is_dashed(start):
-        start = start.replace('-', '')
-
-    if is_dashed(end):
-        end = end.replace('-', '')
-
-    startDt = datetime.datetime.strptime(start, '%Y%m%d').date()
-    endDt = datetime.datetime.strptime(end, '%Y%m%d').date()
-
-    td = datetime.timedelta(days=1)
-
-    csv_filename = 'meetings_%s_%s.csv' % (start, end)
-
-    with open('%s/%s' % (sources_dir, csv_filename), 'wa') as f:
-        f.write(header.encode('utf-8'))
-        while startDt <= endDt:
-            filename = str(startDt).replace('-', '')
-            crawl(('%s' % base_url) % filename, sources_dir, filename)
-            result = parse_meeting_schedule(('%s/%s.html' % (sources_dir, filename)))
-            f.write('\n'.join(\
-                ['"%s","%s","%s","%s","%s","%s","%s","%s"' % (date, time, type, title, session, sitting, committee, link) for date, time, type, title, session, sitting, committee, link in result]
-            ).encode('utf-8'))
-            f.write('\n')
-            startDt = startDt + td
-
-    print 'parsed to %s' % csv_filename
-
-if __name__=='__main__':
-    if len(sys.argv) is 1:
-        print 'usage: python get.py YYYY-MM-DD YYYY-MM-DD'
-        print '       python get.py YYYY-MM-DD'
-    elif len(sys.argv) is 2:
-        get_meeting_list(sys.argv[1], sys.argv[1])
-    elif len(sys.argv) is 3:
-        get_meeting_list(sys.argv[1], sys.argv[2])
+    date_length = len('0000-00-00') + 1
+
+    session_re = re.compile(u'제(?P<session>[0-9]+)회')
+    sitting_re = re.compile(u'제(?P<sitting>[0-9]+)차')
+
+    with open(filename, 'r') as f:
+        p = get_webpage(f)
+
+        raw_titles = p.xpath(xpath_title)[0:]
+        link_params = p.xpath(xpath_link_params)[0:]
+        datetimes = p.xpath(xpath_datetime_committee)[0:]
+        committees = p.xpath(xpath_datetime_committee)[0:]
+
+        datetimes = [datetime.text for datetime in datetimes if datetime.text.strip() != '']
+        committees = ['' if committee.findtext('span') is None else committee.findtext('span') for committee in committees]
+        link_params = [link_param.replace('jsDetail(', '').replace(');return false;', '') for link_param in link_params]
+
+        dates = [datetime[:date_length].strip() for datetime in datetimes]
+        times = [datetime[date_length:].strip() for datetime in datetimes]
+        types = [title[title.find('[') + 1:title.find(']')] for title in raw_titles]
+        titles = [title[title.find(']') + 2:] for title in raw_titles]
+        sessions = [session_re.findall(title)[0] for title in titles]
+        sittings = [sitting_re.findall(title)[0] for title in titles]
+        links = [eval('get_link_url(%s)' % link_param) for link_param in link_params]
+
+        return zip(dates, times, types, titles, sessions, sittings, committees, links)
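Note: both the old and the new version still eval() the scraped onclick argument list to call get_link_url, which executes whatever the page serves. If every jsDetail argument is a quoted literal, as the scraped markup suggests, the eval could be dropped; a minimal sketch under that assumption, with parse_link_params as a hypothetical helper:

    # Hypothetical helper: split "'A01', '12345', ..."-style param lists without eval.
    def parse_link_params(link_param):
        return [p.strip().strip("'\"") for p in link_param.split(',')]

    # links = [get_link_url(*parse_link_params(lp)) for lp in link_params]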


+def get_meeting_list(start, end=None):
+    if is_dashed(start):
+        start = start.replace('-', '')
+
+    startdt = datetime.datetime.strptime(start, '%Y%m%d').date()
+
+    if end is None:
+        enddt = startdt
+        csv_filename = 'meetings_%s.csv' % start
+    else:
+        if is_dashed(end):
+            end = end.replace('-', '')
+        enddt = datetime.datetime.strptime(end, '%Y%m%d').date()
+        csv_filename = 'meetings_%s_%s.csv' % (start, end)
+
+    td = datetime.timedelta(days=1)
+
+    if not os.path.exists(sources_dir):
+        os.makedirs(sources_dir)
+
+    with open('%s/%s' % (sources_dir, csv_filename), 'w+') as f:
+        f.write(header.encode('utf-8'))
+        while startdt <= enddt:
+            filename = str(startdt).replace('-', '')
+            crawl(('%s' % base_url) % filename, sources_dir, filename)
+            result = parse_meeting_schedule(('%s/%s.html' % (sources_dir, filename)))
+
+            startdt = startdt + td
+
+            if len(result) == 0:
+                continue
+
+            f.write('\n'.join(
+                ['"%s","%s","%s","%s","%s","%s","%s","%s"' % (
+                    date, time, type, title, session, sitting, committee, link) for
+                    date, time, type, title, session, sitting, committee, link in result]
+            ).encode('utf-8'))
+            f.write('\n')
+
+    print 'parsed to %s' % csv_filename
+
+
+if __name__ == '__main__':
+    if len(sys.argv) is 1:
+        print 'usage: python get.py YYYY-MM-DD YYYY-MM-DD'
+        print '       python get.py YYYY-MM-DD'
+    elif len(sys.argv) is 2:
+        get_meeting_list(sys.argv[1])
+    elif len(sys.argv) is 3:
+        get_meeting_list(sys.argv[1], sys.argv[2])
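With the end date now optional, both invocation forms from the usage text work; for example (the dates are arbitrary):

    $ python get.py 2014-03-01 2014-03-31   # writes ./sources/meetings_20140301_20140331.csv
    $ python get.py 2014-03-01              # single day: ./sources/meetings_20140301.csv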
Empty file modified naver_news/crawl.py (mode 100755 → 100644, contents unchanged)