From 9e85b3a2d3fd7f8940e12fcbb9205a4c7d457b1b Mon Sep 17 00:00:00 2001
From: Youngkyoung Lee
Date: Thu, 20 Nov 2014 15:55:23 +0900
Subject: [PATCH 1/4] fix filename encoding error

---
 meetings/crawl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/meetings/crawl.py b/meetings/crawl.py
index b85b458..bfd51bc 100755
--- a/meetings/crawl.py
+++ b/meetings/crawl.py
@@ -23,7 +23,7 @@ def checkdir(directory):
 def get_html(page_num):
     url = '%s/new/new_list.jsp?CLASS_CODE=0&currentPage=%d' % (baseurl, page_num)
     r = requests.get(url)
-    return r.text.encode('utf-8')
+    return unicode(r.text.encode('utf-8'), 'utf-8')
 
 def get_hidden_url(url):
     r = requests.get(url)
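Note on this patch: under Python 2, requests' r.text is already a unicode object, and the old code returned its UTF-8 str encoding; the round trip above makes get_html() hand back unicode again, which avoids implicit ASCII decodes when the value is later spliced into filenames. A minimal sketch of the distinction (the Korean sample string is illustrative, not taken from the crawler):

    # -*- coding: utf-8 -*-
    # Python 2: str holds bytes, unicode holds code points.
    text = u'국회'                        # what requests' r.text gives back (unicode)
    raw = text.encode('utf-8')            # byte str: '\xea\xb5\xad\xed\x9a\x8c'
    assert unicode(raw, 'utf-8') == text  # the patch's round trip restores unicode

    try:
        u'%s.html' % raw                  # splicing bytes into a unicode format
    except UnicodeDecodeError:            # string forces an implicit ascii decode,
        print 'this is the filename bug'  # which fails on the Korean bytes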
From 57f9ee82e0ca9fb9dc3abbe0e34e8e9201da3bdd Mon Sep 17 00:00:00 2001
From: Youngkyoung Lee
Date: Thu, 20 Nov 2014 16:26:35 +0900
Subject: [PATCH 2/4] Fix meeting_calendar crawler error
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Fix the crawler failing when the directory it saves to has not been
  created yet
- Fix blank lines being written to the result file when a queried date
  has no meeting schedule at all

---
 meetings_calendar/get.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/meetings_calendar/get.py b/meetings_calendar/get.py
index 84489f5..cabfabc 100644
--- a/meetings_calendar/get.py
+++ b/meetings_calendar/get.py
@@ -87,17 +87,24 @@
 
     csv_filename = 'meetings_%s_%s.csv' % (start, end)
 
+    if not os.path.exists(sources_dir):
+        os.makedirs(sources_dir)
+
     with open('%s/%s' % (sources_dir, csv_filename), 'wa') as f:
         f.write(header.encode('utf-8'))
         while startDt <= endDt:
             filename = str(startDt).replace('-', '')
             crawl(('%s' % base_url) % filename, sources_dir, filename)
             result = parse_meeting_schedule(('%s/%s.html' % (sources_dir, filename)))
+
+            startDt = startDt + td
+            if len(result) == 0:
+                continue
+
             f.write('\n'.join(\
                 ['"%s","%s","%s","%s","%s","%s","%s","%s"' % (date, time, type, title, session, sitting, committee, link) for date, time, type, title, session, sitting, committee, link in result]
                 ).encode('utf-8'))
             f.write('\n')
-            startDt = startDt + td
 
     print 'parsed to %s' % csv_filename
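Note the ordering inside the loop after this patch: the date is advanced before the empty-result check, because a continue on an unadvanced date would retry the same day forever. A minimal sketch of the pattern (fetch_day is a hypothetical stand-in for the crawl-and-parse step):

    import datetime

    def fetch_day(day):
        # hypothetical stand-in for crawl() followed by parse_meeting_schedule()
        return []  # pretend no meetings were scheduled on any day

    day = datetime.date(2014, 11, 20)
    end = datetime.date(2014, 11, 22)
    td = datetime.timedelta(days=1)

    while day <= end:
        result = fetch_day(day)
        day = day + td         # advance first, so 'continue' cannot loop forever
        if len(result) == 0:
            continue           # skip the write: keeps blank lines out of the CSV
        # ... write one CSV row per entry in result ...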
From 548a455a3eb1adf9ce73e30a0ab4cffc06da12db Mon Sep 17 00:00:00 2001
From: majorika
Date: Sat, 27 Feb 2016 23:29:04 +0900
Subject: [PATCH 3/4] fix datetime, committee parsing error

---
 meetings_calendar/get.py | 185 +++++++++++++++++++++------------------
 1 file changed, 98 insertions(+), 87 deletions(-)

diff --git a/meetings_calendar/get.py b/meetings_calendar/get.py
index cabfabc..9f5f523 100644
--- a/meetings_calendar/get.py
+++ b/meetings_calendar/get.py
@@ -2,7 +2,6 @@
 # -*- coding: utf-8 -*-
 
 import os
-import io
 import urllib2
 import html5lib
 import datetime
@@ -19,100 +18,112 @@
 xpath_title = '//a[contains(@onclick, "jsDetail")]/text()'
 xpath_link_params = '//a[contains(@onclick, "jsDetail")]/@onclick'
-xpath_datetime = '//dd/text()'
-xpath_committee = '//dd/span/text()'
+xpath_datetime_committee = '//dd'
 
-def is_dashed(str):
-    if str.count('-') > 0:
-        return True
-    else:
-        return False
-
-def crawl(url, directory, filename):
-    if not os.path.exists(directory):
-        os.makedirs(directory)
-
-    r = urllib2.urlopen(url)
-    with open('%s/%s.html' % (directory, filename), 'w') as f:
-        f.write(r.read())
-
-def get_webpage(f):
-    page = html5lib.HTMLParser(\
-        tree=html5lib.treebuilders.getTreeBuilder("lxml"),\
-        namespaceHTMLElements=False)
-    p = page.parse(f, encoding='utf-8')
-    return p
-
-def get_link_url(gubun, agendaid, committee_id, board_id, record_id):
-    return (link_url % (gubun, agendaid, committee_id, board_id, record_id))
-
-def parse_meeting_schedule(filename):
-    date_length = len('0000-00-00') + 1
-
-    session_re = re.compile(u'제(?P<session>[0-9]+)회')
-    sitting_re = re.compile(u'제(?P<sitting>[0-9]+)차')
-    with open(filename, 'r') as f:
-        p = get_webpage(f)
-
-        raw_titles = p.xpath(xpath_title)[0:]
-        link_params = p.xpath(xpath_link_params)[0:]
-        datetimes = p.xpath(xpath_datetime)[0:]
-        committes = p.xpath(xpath_committee)[0:]
-
-        datetimes = [datetime for datetime in datetimes if datetime.strip() != '']
-        link_params = [link_param.replace('jsDetail(', '').replace(');return false;', '') for link_param in link_params]
-
-        dates = [datetime[:date_length].strip() for datetime in datetimes]
-        times = [datetime[date_length:].strip() for datetime in datetimes]
-        types = [title[title.find('[')+1:title.find(']')] for title in raw_titles]
-        titles = [title[title.find(']')+2:] for title in raw_titles]
-        sessions = [session_re.findall(title)[0] for title in titles]
-        sittings = [sitting_re.findall(title)[0] for title in titles]
-        links = [eval('get_link_url(%s)' % link_param) for link_param in link_params]
-
-        return zip(dates, times, types, titles, sessions, sittings, committes, links)
-
-def get_meeting_list(start, end):
-    if is_dashed(start):
-        start = start.replace('-', '')
-
-    if is_dashed(end):
-        end = end.replace('-', '')
+def is_dashed(str):
+    if str.count('-') > 0:
+        return True
+    else:
+        return False
 
-    startDt = datetime.datetime.strptime(start, '%Y%m%d').date()
-    endDt = datetime.datetime.strptime(end, '%Y%m%d').date()
-    td = datetime.timedelta(days=1)
+def crawl(url, directory, filename):
+    if not os.path.exists(directory):
+        os.makedirs(directory)
 
-    csv_filename = 'meetings_%s_%s.csv' % (start, end)
+    r = urllib2.urlopen(url)
+    with open('%s/%s.html' % (directory, filename), 'w+') as f:
+        f.write(r.read())
 
-    if not os.path.exists(sources_dir):
-        os.makedirs(sources_dir)
-    with open('%s/%s' % (sources_dir, csv_filename), 'wa') as f:
-        f.write(header.encode('utf-8'))
-        while startDt <= endDt:
-            filename = str(startDt).replace('-', '')
-            crawl(('%s' % base_url) % filename, sources_dir, filename)
-            result = parse_meeting_schedule(('%s/%s.html' % (sources_dir, filename)))
+def get_webpage(f):
+    page = html5lib.HTMLParser(
+        tree=html5lib.treebuilders.getTreeBuilder("lxml"),
+        namespaceHTMLElements=False)
+    p = page.parse(f, encoding='utf-8')
+    return p
 
-            startDt = startDt + td
-            if len(result) == 0:
-                continue
-            f.write('\n'.join(\
-                ['"%s","%s","%s","%s","%s","%s","%s","%s"' % (date, time, type, title, session, sitting, committee, link) for date, time, type, title, session, sitting, committee, link in result]
-                ).encode('utf-8'))
-            f.write('\n')
+def get_link_url(gubun, agendaid, committee_id, board_id, record_id):
+    return (link_url % (gubun, agendaid, committee_id, board_id, record_id))
 
-    print 'parsed to %s' % csv_filename
-if __name__=='__main__':
-    if len(sys.argv) is 1:
-        print 'usage: python get.py YYYY-MM-DD YYYY-MM-DD'
-        print '       python get.py YYYY-MM-DD'
-    elif len(sys.argv) is 2:
-        get_meeting_list(sys.argv[1], sys.argv[1])
-    elif len(sys.argv) is 3:
-        get_meeting_list(sys.argv[1], sys.argv[2])
+def parse_meeting_schedule(filename):
+    date_length = len('0000-00-00') + 1
+
+    session_re = re.compile(u'제(?P<session>[0-9]+)회')
+    sitting_re = re.compile(u'제(?P<sitting>[0-9]+)차')
+
+    with open(filename, 'r') as f:
+        p = get_webpage(f)
+
+        raw_titles = p.xpath(xpath_title)[0:]
+        link_params = p.xpath(xpath_link_params)[0:]
+        datetimes = p.xpath(xpath_datetime_committee)[0:]
+        committees = p.xpath(xpath_datetime_committee)[0:]
+
+        datetimes = [datetime.text for datetime in datetimes if datetime.text.strip() != '']
+        committees = ['' if committee.findtext('span') is None else committee.findtext('span') for committee in committees]
+        link_params = [link_param.replace('jsDetail(', '').replace(');return false;', '') for link_param in link_params]
+
+        dates = [datetime[:date_length].strip() for datetime in datetimes]
+        times = [datetime[date_length:].strip() for datetime in datetimes]
+        types = [title[title.find('[') + 1:title.find(']')] for title in raw_titles]
+        titles = [title[title.find(']') + 2:] for title in raw_titles]
+        sessions = [session_re.findall(title)[0] for title in titles]
+        sittings = [sitting_re.findall(title)[0] for title in titles]
+        links = [eval('get_link_url(%s)' % link_param) for link_param in link_params]
+
+        return zip(dates, times, types, titles, sessions, sittings, committees, links)
+
+
+def get_meeting_list(start, end=None):
+    if is_dashed(start):
+        start = start.replace('-', '')
+
+    startdt = datetime.datetime.strptime(start, '%Y%m%d').date()
+
+    if end is None:
+        enddt = startdt
+        csv_filename = 'meetings_%s.csv' % start
+    else:
+        if is_dashed(end):
+            end = end.replace('-', '')
+        enddt = datetime.datetime.strptime(end, '%Y%m%d').date()
+        csv_filename = 'meetings_%s_%s.csv' % (start, end)
+
+    td = datetime.timedelta(days=1)
+
+    if not os.path.exists(sources_dir):
+        os.makedirs(sources_dir)
+
+    with open('%s/%s' % (sources_dir, csv_filename), 'w+') as f:
+        f.write(header.encode('utf-8'))
+        while startdt <= enddt:
+            filename = str(startdt).replace('-', '')
+            crawl(('%s' % base_url) % filename, sources_dir, filename)
+            result = parse_meeting_schedule(('%s/%s.html' % (sources_dir, filename)))
+
+            startdt = startdt + td
+
+            if len(result) == 0:
+                continue
+
+            f.write('\n'.join(
+                ['"%s","%s","%s","%s","%s","%s","%s","%s"' % (
+                    date, time, type, title, session, sitting, committee, link) for
+                    date, time, type, title, session, sitting, committee, link in result]
+            ).encode('utf-8'))
+            f.write('\n')
+
+    print 'parsed to %s' % csv_filename
+
+if __name__ == '__main__':
+    if len(sys.argv) is 1:
+        print 'usage: python get.py YYYY-MM-DD YYYY-MM-DD'
+        print '       python get.py YYYY-MM-DD'
+    elif len(sys.argv) is 2:
+        get_meeting_list(sys.argv[1])
+    elif len(sys.argv) is 3:
+        get_meeting_list(sys.argv[1], sys.argv[2])
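What makes the parsing fix work: the single '//dd' XPath returns lxml elements rather than bare text nodes, so each row's datetime (.text) and optional committee (findtext('span')) stay paired even when the span is missing, which is what previously knocked the two separately extracted lists out of alignment. A minimal sketch (the HTML markup is illustrative, not copied from the Assembly site):

    from lxml import html

    # Illustrative markup only: the second <dd> has no committee <span>.
    doc = html.fromstring(
        '<dl>'
        '<dd>2016-02-27 14:00 <span>Committee A</span></dd>'
        '<dd>2016-02-28 10:00</dd>'
        '</dl>')

    for dd in doc.xpath('//dd'):
        when = dd.text.strip()                 # element text before the first child tag
        committee = dd.findtext('span') or ''  # '' when the <span> is absent
        print when, '|', committee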
From 2e0a198650956b289cf31b97a51c9234a1e8b7f2 Mon Sep 17 00:00:00 2001
From: majorika
Date: Sat, 27 Feb 2016 23:51:07 +0900
Subject: [PATCH 4/4] fix meeting_calendar url field

---
 meetings_calendar/get.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/meetings_calendar/get.py b/meetings_calendar/get.py
index 9f5f523..f7f70fd 100644
--- a/meetings_calendar/get.py
+++ b/meetings_calendar/get.py
@@ -10,7 +10,8 @@
 base_url = 'http://www.assembly.go.kr/renew10/anc/schedule/assm/assemact/council/council0101/assmSchCal/assemSchCalInfoAjax.do?currentPage=&movePageNum=&rowPerPage=1000&gubun=&agendaid=&committee_id=&board_id=&record_id=&returnPage=&weekday=&today=&calendarMove=&showDt=&meetingday=%s'
-link_url = 'http://www.assembly.go.kr/renew10/anc/schedule/assm/assemact/council/council0101/assmSchCal/assemScanCalDetail.do?gubun=%s&agendaid=%s&committee_id=%s&board_id=%s&record_id=%s'
+assem_url = 'http://www.assembly.go.kr/renew10/anc/schedule/assm/assemact/council/council0101/assmSchCal/assemScanCalDetail.do?gubun={gubun}&agendaid={agenda_id}'
+cmmtt_url = 'http://www.assembly.go.kr/renew10/anc/schedule/assm/assemact/council/council0101/assmSchCal/assemScanCalDetail.do?gubun={gubun}&committee_id={committee_id}&board_id={board_id}&record_id={record_id}'
 
 sources_dir = './sources'
 
@@ -45,8 +46,13 @@ def get_webpage(f):
     return p
 
 
-def get_link_url(gubun, agendaid, committee_id, board_id, record_id):
-    return (link_url % (gubun, agendaid, committee_id, board_id, record_id))
+def get_link_url(gubun, agenda_id, committee_id, board_id, record_id):
+    if committee_id == '$schedule.committee_id'\
+            and board_id == '$schedule.board_id'\
+            and record_id == '$schedule.record_id':
+        return assem_url.format(gubun=gubun, agenda_id=agenda_id)
+
+    return cmmtt_url.format(gubun=gubun, committee_id=committee_id, board_id=board_id, record_id=record_id)
 
 
 def parse_meeting_schedule(filename):
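The '$schedule.*' strings look like template placeholders that the Assembly page emits verbatim when a row carries no committee detail, which is presumably why this patch special-cases them: such rows are routed to the assembly-level detail URL instead of a broken committee link. A condensed sketch of the dispatch (the uppercase names and shortened URLs are mine; the sentinel strings are copied from the patch above):

    # Shortened stand-ins for assem_url and cmmtt_url in the patch.
    ASSEM_URL = 'assemScanCalDetail.do?gubun={gubun}&agendaid={agenda_id}'
    CMMTT_URL = ('assemScanCalDetail.do?gubun={gubun}&committee_id={committee_id}'
                 '&board_id={board_id}&record_id={record_id}')

    def get_link_url(gubun, agenda_id, committee_id, board_id, record_id):
        placeholders = ('$schedule.committee_id', '$schedule.board_id',
                        '$schedule.record_id')
        if (committee_id, board_id, record_id) == placeholders:
            # The row came back with unrendered template fields, so only
            # the assembly-level detail page can be linked.
            return ASSEM_URL.format(gubun=gubun, agenda_id=agenda_id)
        return CMMTT_URL.format(gubun=gubun, committee_id=committee_id,
                                board_id=board_id, record_id=record_id)

    print get_link_url('A', '1234', '$schedule.committee_id',
                       '$schedule.board_id', '$schedule.record_id')

Using str.format with named fields here also removes the positional coupling that the old '%s' chain had, so dropping agendaid from one URL and committee fields from the other cannot silently shift the remaining arguments.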