get bills' meta data for 20th assembly #45

Open · wants to merge 1 commit into base: master
6 changes: 2 additions & 4 deletions bills/main.py
@@ -5,16 +5,14 @@
 import specific
 import pdf
 
-assembly_s, assembly_e = 17, 19 # start, end id of assembly
+assembly_s, assembly_e = 20, 20 # start, end id of assembly
 bill_s, bill_e = None, None # start, end number of bill
 
 for a in range(assembly_s, assembly_e+1):
     print '\n# Assembly %d' % a
 
     print '## Get meta data'
-    npages = meta.get_npages(a)
-    meta.get_html(a, npages)
-    meta.html2csv(a, npages)
+    meta.html2csv(a)
 
     print '## Get specific data'
     specific.get_html(a, range=(bill_s, bill_e))
115 changes: 88 additions & 27 deletions bills/meta/csv.py
@@ -1,57 +1,118 @@
 #! /usr/bin/python2.7
 # -*- coding: utf-8 -*-
 
+from __future__ import print_function
+import os
+from selenium import webdriver
+from bs4 import BeautifulSoup
+
 import re
 import sys
 
 import lxml
 import utils
 from settings import DIR, BASEURL, META_HEADERS, X
 
-def html2csv(assembly_id, npages):
+URL = 'http://likms.assembly.go.kr/bill/FinishBill.do'
+
+def html2csv(assembly_id):
 
     def list_to_file(l, f):
         f.write('"')
         f.write('","'.join(l).encode('utf-8'))
         f.write('"\n')
 
+    def getContents(one_td):  # text of a <td>, or '' when the cell has child tags
+        result = ""
+        if one_td.string != None:
+            result = one_td.string.strip()
+        return result
+
+    def getBillId(one_href):  # the href is a javascript call; the bill id is its first quoted argument
+        result = ""
+        splited = one_href.split("'")
+        if len(splited) >= 2:
+            result = splited[1]
+        return result
+
+    def getTitleBillId(one_td):  # (title attribute, bill id) of the link in a cell
+        return one_td.a['title'], getBillId(one_td.a.attrs['href'])
+
     def parse_columns(columns):
         data = []
-        for j, c in enumerate(columns):
-            if j==1:
-                status = str(int(\
-                    re.findall(r'[0-9]+', c.xpath('img/@src')[0])[0]))
-                title = c.xpath('a/text()')[0].replace('"','\'')
-                link = re.findall(r'\w+', c.xpath('a/@href')[0])[2]
-                data.extend([status, title, link])
-            elif j==6:
-                data.append('1' if c.xpath('img/@onclick') else '0')
-            else:
-                data.append(c.xpath('text()')[0].strip())
+
+        aList = columns.find_all('td')
+        bill_id = getContents(aList[0])            # bill number (의안번호)
+        title, link_id = getTitleBillId(aList[1])  # bill title (의안명), link id
+        proposer_type = getContents(aList[2])      # proposer type (제안자구분)
+        proposed_date = getContents(aList[3])      # proposal date (제안일)
+        #submitDt = getContents(aList[4])          # referral date (회부일)
+        #committeeName = getContents(aList[5])     # committee in charge (소관위원회)
+        decision_date = getContents(aList[6])      # decision date (의결일자)
+        decision_result = getContents(aList[7])    # decision result (의결결과)
+
+        status = ''
+        status_detail = ''
+        has_summaries = ''
+        if link_id:
+            has_summaries = '1'
+
+        data.extend([bill_id, status, title, link_id, proposer_type, proposed_date, decision_date, decision_result, has_summaries, status_detail])
+
         return data
 
-    def parse_page(page, f, assembly_id):
-        fn = '%s/%s/%d.html' % (DIR['list'], assembly_id, page)
-        p = utils.read_webpage(fn)
-        rows = utils.get_elems(p, X['table'])
+    def parse_body_of_table(one_tbody, f):
+        for bdId in one_tbody.find_all('tr'):
+            p = parse_columns(bdId)
+            list_to_file(p, f)
+
+    def parse_page(page, f):
+        soup = BeautifulSoup(page, 'lxml')
+        table = soup.find(summary="검색결과의 의안번호, 의안명, 제안자구분, 제안일, 회부일, 소관위원회, 의결일자, 의결결과 정보")  # result table, located by its summary attribute
+        parse_body_of_table(table.tbody, f)
+
+    def find_next_page(driver):  # click the page link after the one marked 'on'; False when on the last page
+        page_elem = driver.find_element_by_xpath("//div[@id='pageListViewArea']")
+        all_hrefs = page_elem.find_elements_by_tag_name("a")
+        for idx, href in enumerate(all_hrefs):
+            if href.get_attribute("class") == 'on':
+                if (idx + 1) < len(all_hrefs):
+                    all_hrefs[idx + 1].click()
+                    return True
+        return False
 
-        for r in reversed(rows):
-            columns = r.xpath(X['columns'])
-            if len(columns)==8:
-                p = parse_columns(columns)
-                list_to_file(p, f)
+    driver = webdriver.PhantomJS()  # you must have phantomjs in your $PATH
+    driver.get(URL)
 
-        sys.stdout.write('%d\t' % page)
-        sys.stdout.flush()
+    wanted_age = str(assembly_id)
+
+    search_elem = driver.find_element_by_xpath("//select[@name='age']")  # assembly ('age') selector in the search form
+    all_options = search_elem.find_elements_by_tag_name("option")
+    for option in all_options:
+        if option.get_attribute("value") == wanted_age:
+            if option.get_attribute("selected") == None:
+                option.click()
+            break
+
+    search_elem = driver.find_element_by_xpath("//select[@title='의안종류선택']")  # bill-type selector ('의안종류선택' = select bill type)
+    all_options = search_elem.find_elements_by_tag_name("option")
+    for option in all_options:
+        if option.get_attribute("value") == u'전체':  # '전체' = all types
+            if option.get_attribute("selected") == None:
+                option.click()
+            break
+
     directory = DIR['meta']
     utils.check_dir(directory)
     meta_data = '%s/%d.csv' % (directory, assembly_id)
 
-    print '\nParsing:'
+    print('\nParsing:')
     with open(meta_data, 'wa') as f:
         list_to_file(META_HEADERS, f)
-        for page in range(1, npages+1):
-            parse_page(page, f, assembly_id)
+        parse_page(driver.page_source, f)
+        while find_next_page(driver) == True:
+            parse_page(driver.page_source, f)
+
+    driver.quit()
 
-    print '\nMeta data written to ' + meta_data
+    print('\nMeta data written to ' + meta_data)
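
For reference, a minimal usage sketch of driving the new scraper, mirroring the updated bills/main.py above; it assumes the bills/meta package exposes html2csv as main.py implies, and that PhantomJS is installed and on $PATH for the Selenium driver.

    import meta

    assembly_s, assembly_e = 20, 20  # start, end id of assembly, per this PR

    for a in range(assembly_s, assembly_e + 1):
        print '\n# Assembly %d' % a
        print '## Get meta data'
        meta.html2csv(a)  # scrapes likms.assembly.go.kr and writes DIR['meta']/<a>.csv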