Skip to content

Commit

Permalink
support for html and xbrl downloads
Browse files Browse the repository at this point in the history
  • Loading branch information
greedo committed Dec 29, 2014
1 parent 539407d commit 57ee031
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 18 deletions.
1 change: 1 addition & 0 deletions download.py
21 changes: 21 additions & 0 deletions examples/download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#! /usr/bin/env python
# encoding: utf-8
"""Example: download filings for every ticker listed in data.txt.

Reads one ticker per line from ``data.txt``, asks the EDGAR ingestor for
the filing document URLs for that ticker, and downloads each document
into ``docs_directory``.
"""

from ingestor import Ingestor, Edgar, Sedar
import os

ingestor = Ingestor()

# Document format to fetch: "xbrl" or "html" (see Edgar.doc_types).
edgar = Edgar("xbrl")

docs_directory = "test"

# Create the download directory if needed. EAFP instead of the original
# exists()/mkdir() pair, which races if another process creates the
# directory between the check and the mkdir call.
try:
    os.mkdir(docs_directory)
except OSError:
    # Directory already exists (or cannot be created; the downloader
    # will surface a clearer error in the latter case).
    pass

# For every ticker in the input file, download all relevant documents.
with open('data.txt', 'r') as reader:
    for line in reader:
        ingestor.file_downloader(edgar.ingest_stock(line.rstrip()), docs_directory)
32 changes: 20 additions & 12 deletions ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import concurrent.futures
import requests
import re
from datetime import datetime, date, time
import datetime

try:
import cStringIO as StringIO
Expand Down Expand Up @@ -95,21 +95,25 @@ def ingest_stock(self, ticker):

# utf-8
processed = feed.text.encode('utf-8')
print processed
try:
root = ET.fromstring(processed)
except ET.ParseError:
return
print root



class Edgar():
"""
EDGAR is document filing and retrieval system used by the SEC (US)
"""

def __init__(self, start_date=None, end_date=None):
filing_types = ['10-K', '10-Q']
doc_types = {
'html': ["Document Format Files", ".htm", filing_types],
'xbrl': ["Data Files", ".xml", "XBRL INSTANCE DOCUMENT"]
}

def __init__(self, doc_type=None, start_date=None, end_date=None):
self.org_root = "http://www.sec.gov"

if start_date is None:
Expand All @@ -122,17 +126,22 @@ def __init__(self, start_date=None, end_date=None):
else:
self.end_date = end_date

def html_search(self, tree, types):
if doc_type == "html":
self.doc_type = Edgar.doc_types['html']
else:
self.doc_type = Edgar.doc_types['xbrl']

def page_search(self, tree, types):
"""
html_search finds the document url in the document listing html.
page_search finds the document url in the document listing file links.
"""

grab_next = False
tables = list(tree.iter("table"))

try:
for table in tables:
if table.attrib['summary'] == "Document Format Files":
if table.attrib['summary'] == self.doc_type[0]:
for row in table.findall('tr'):
for column in row.findall('td'):
if grab_next:
Expand All @@ -141,7 +150,7 @@ def html_search(self, tree, types):
return link.attrib['href']
break
grab_next = False
if column.text == types:
if column.text in self.doc_type[2]:
grab_next = True
except UnicodeEncodeError:
pass
Expand All @@ -152,10 +161,9 @@ def ingest_stock(self, ticker):
It uses a ticker or keyword.
"""

doc_types = ['10-K', '10-Q']
to_parse = []

for types in doc_types:
for types in self.doc_type[2]:
feed = requests.get(self.org_root+'/cgi-bin/browse-edgar', params={'action': 'getcompany', 'CIK': ticker, 'type': types, 'count': 200, 'output': 'atom'})

# iso-8859-1 -> utf-8
Expand All @@ -173,8 +181,8 @@ def ingest_stock(self, ticker):
output = StringIO.StringIO(requests.get(html_url).text.encode('ascii','ignore'))
tree = etree.parse(output, etree.HTMLParser())

output = self.html_search(tree, types)
if output and ".htm" in output:
output = self.page_search(tree, types)
if output and self.doc_type[1] in output:
to_parse.append(self.org_root+output)

return to_parse
12 changes: 6 additions & 6 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,16 @@ def quit_gracefully(*args):
# note the indexer thread is set to daemon causing it to terminate on a SIGINT
indexer = Indexer(queryer.store_dir, queryer.writer, directoryToWalk)
ingestor = Ingestor()
edgar = Edgar()
#sedar = Sedar()
#edgar = Edgar()
sedar = Sedar()

with open('data.txt', 'r') as reader:
for line in reader:
ingestor.file_downloader(edgar.ingest_stock(line.rstrip()), directoryToWalk)
indexer.indexDocs()
ingestor.file_downloader(sedar.ingest_stock(line.rstrip()), directoryToWalk)
#indexer.indexDocs()

# start up the terminal query interface
queryer.run(queryer.writer, queryer.analyzer)
#queryer.run(queryer.writer, queryer.analyzer)

# if return from Querying then call the signal handler to clean up the writer cleanly
quit_gracefully()
#quit_gracefully()

0 comments on commit 57ee031

Please sign in to comment.