get bills' meta data for 20th assembly #45

Open · wants to merge 1 commit into base: master
6 changes: 2 additions & 4 deletions bills/main.py
@@ -5,16 +5,14 @@
 import specific
 import pdf
 
-assembly_s, assembly_e = 17, 19 # start, end id of assembly
+assembly_s, assembly_e = 20, 20 # start, end id of assembly
 bill_s, bill_e = None, None # start, end number of bill
 
 for a in range(assembly_s, assembly_e+1):
     print '\n# Assembly %d' % a
 
     print '## Get meta data'
-    npages = meta.get_npages(a)
-    meta.get_html(a, npages)
-    meta.html2csv(a, npages)
+    meta.html2csv(a)
 
     print '## Get specific data'
     specific.get_html(a, range=(bill_s, bill_e))
115 changes: 88 additions & 27 deletions bills/meta/csv.py
@@ -1,57 +1,118 @@
 #! /usr/bin/python2.7
 # -*- coding: utf-8 -*-
 
+from __future__ import print_function
+import os
+from selenium import webdriver
+from bs4 import BeautifulSoup
+
 import re
 import sys
 
 import lxml
 import utils
 from settings import DIR, BASEURL, META_HEADERS, X
 
-def html2csv(assembly_id, npages):
+URL = 'http://likms.assembly.go.kr/bill/FinishBill.do'
+
+def html2csv(assembly_id):
 
     def list_to_file(l, f):
         f.write('"')
         f.write('","'.join(l).encode('utf-8'))
         f.write('"\n')
 
+    def getContents(one_td):  # text of a <td>, or '' when the cell has child tags
+        result = ""
+        if one_td.string != None:
+            result = one_td.string.strip()
+        return result
+
+    def getBillId(one_href):  # the href is a javascript call; the bill id is its first quoted argument
+        result = ""
+        splited = one_href.split("'")
+        if len(splited) >= 2:
+            result = splited[1]
+        return result
+
+    def getTitleBillId(one_td):  # (title attribute, bill id) of the link in a cell
+        return one_td.a['title'], getBillId(one_td.a.attrs['href'])
+
     def parse_columns(columns):
         data = []
-        for j, c in enumerate(columns):
-            if j==1:
-                status = str(int(\
-                    re.findall(r'[0-9]+', c.xpath('img/@src')[0])[0]))
-                title = c.xpath('a/text()')[0].replace('"','\'')
-                link = re.findall(r'\w+', c.xpath('a/@href')[0])[2]
-                data.extend([status, title, link])
-            elif j==6:
-                data.append('1' if c.xpath('img/@onclick') else '0')
-            else:
-                data.append(c.xpath('text()')[0].strip())
+
+        aList = columns.find_all('td')
+        bill_id = getContents(aList[0])            # bill number (의안번호)
+        title, link_id = getTitleBillId(aList[1])  # bill title (의안명), link id
+        proposer_type = getContents(aList[2])      # proposer type (제안자구분)
+        proposed_date = getContents(aList[3])      # proposal date (제안일)
+        #submitDt = getContents(aList[4])          # referral date (회부일)
+        #committeeName = getContents(aList[5])     # committee in charge (소관위원회)
+        decision_date = getContents(aList[6])      # decision date (의결일자)
+        decision_result = getContents(aList[7])    # decision result (의결결과)
+
+        status = ''
+        status_detail = ''
+        has_summaries = ''
+        if link_id:
+            has_summaries = '1'
+
+        data.extend([bill_id, status, title, link_id, proposer_type, proposed_date, decision_date, decision_result, has_summaries, status_detail])
+
         return data
 
-    def parse_page(page, f, assembly_id):
-        fn = '%s/%s/%d.html' % (DIR['list'], assembly_id, page)
-        p = utils.read_webpage(fn)
-        rows = utils.get_elems(p, X['table'])
+    def parse_body_of_table(one_tbody, f):
+        for bdId in one_tbody.find_all('tr'):
+            p = parse_columns(bdId)
+            list_to_file(p, f)
+
+    def parse_page(page, f):
+        soup = BeautifulSoup(page, 'lxml')
+        table = soup.find(summary="검색결과의 의안번호, 의안명, 제안자구분, 제안일, 회부일, 소관위원회, 의결일자, 의결결과 정보")  # result table, located by its summary attribute
+        parse_body_of_table(table.tbody, f)
+
+    def find_next_page(driver):  # click the page link after the one marked 'on'; False when on the last page
+        page_elem = driver.find_element_by_xpath("//div[@id='pageListViewArea']")
+        all_hrefs = page_elem.find_elements_by_tag_name("a")
+        for idx, href in enumerate(all_hrefs):
+            if href.get_attribute("class") == 'on':
+                if (idx + 1) < len(all_hrefs):
+                    all_hrefs[idx + 1].click()
+                    return True
+        return False
 
-        for r in reversed(rows):
-            columns = r.xpath(X['columns'])
-            if len(columns)==8:
-                p = parse_columns(columns)
-                list_to_file(p, f)
+    driver = webdriver.PhantomJS()  # you must have phantomjs in your $PATH
+    driver.get(URL)
 
-        sys.stdout.write('%d\t' % page)
-        sys.stdout.flush()
+    wanted_age = str(assembly_id)
+
+    search_elem = driver.find_element_by_xpath("//select[@name='age']")  # assembly ('age') selector in the search form
+    all_options = search_elem.find_elements_by_tag_name("option")
+    for option in all_options:
+        if option.get_attribute("value") == wanted_age:
+            if option.get_attribute("selected") == None:
+                option.click()
+            break
+
+    search_elem = driver.find_element_by_xpath("//select[@title='의안종류선택']")  # bill-type selector ('의안종류선택' = select bill type)
+    all_options = search_elem.find_elements_by_tag_name("option")
+    for option in all_options:
+        if option.get_attribute("value") == u'전체':  # '전체' = all types
+            if option.get_attribute("selected") == None:
+                option.click()
+            break
+
     directory = DIR['meta']
     utils.check_dir(directory)
     meta_data = '%s/%d.csv' % (directory, assembly_id)
 
-    print '\nParsing:'
+    print('\nParsing:')
     with open(meta_data, 'wa') as f:
         list_to_file(META_HEADERS, f)
-        for page in range(1, npages+1):
-            parse_page(page, f, assembly_id)
+        parse_page(driver.page_source, f)
+        while find_next_page(driver) == True:
+            parse_page(driver.page_source, f)
+
+    driver.quit()
 
-    print '\nMeta data written to ' + meta_data
+    print('\nMeta data written to ' + meta_data)
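
For reference, a minimal usage sketch of driving the new scraper, mirroring the updated bills/main.py above; it assumes the bills/meta package exposes html2csv as main.py implies, and that PhantomJS is installed and on $PATH for the Selenium driver.

    import meta

    assembly_s, assembly_e = 20, 20  # start, end id of assembly, per this PR

    for a in range(assembly_s, assembly_e + 1):
        print '\n# Assembly %d' % a
        print '## Get meta data'
        meta.html2csv(a)  # scrapes likms.assembly.go.kr and writes DIR['meta']/<a>.csv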