Skip to content

Commit

Permalink
support for html and xbrl downloads
Browse files Browse the repository at this point in the history
  • Loading branch information
greedo committed Dec 29, 2014
1 parent 539407d commit 57ee031
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 18 deletions.
1 change: 1 addition & 0 deletions download.py
21 changes: 21 additions & 0 deletions examples/download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#! /usr/bin/env python
# encoding: utf-8
"""Example: download filings for every ticker listed in data.txt.

Reads one ticker per line from ``data.txt``, asks the EDGAR ingestor for
the filing document URLs for that ticker, and downloads each document
into ``docs_directory``.
"""

from ingestor import Ingestor, Edgar, Sedar
import os

ingestor = Ingestor()

# Document format to fetch: "xbrl" or "html" (see Edgar.doc_types).
edgar = Edgar("xbrl")

docs_directory = "test"

# Create the download directory if needed. EAFP instead of the original
# exists()/mkdir() pair, which races if another process creates the
# directory between the check and the mkdir call.
try:
    os.mkdir(docs_directory)
except OSError:
    # Directory already exists (or cannot be created; the downloader
    # will surface a clearer error in the latter case).
    pass

# For every ticker in the input file, download all relevant documents.
with open('data.txt', 'r') as reader:
    for line in reader:
        ingestor.file_downloader(edgar.ingest_stock(line.rstrip()), docs_directory)
32 changes: 20 additions & 12 deletions ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import concurrent.futures
import requests
import re
from datetime import datetime, date, time
import datetime

try:
import cStringIO as StringIO
Expand Down Expand Up @@ -95,21 +95,25 @@ def ingest_stock(self, ticker):

# utf-8
processed = feed.text.encode('utf-8')
print processed
try:
root = ET.fromstring(processed)
except ET.ParseError:
return
print root



class Edgar():
"""
EDGAR is document filing and retrieval system used by the SEC (US)
"""

def __init__(self, start_date=None, end_date=None):
filing_types = ['10-K', '10-Q']
doc_types = {
'html': ["Document Format Files", ".htm", filing_types],
'xbrl': ["Data Files", ".xml", "XBRL INSTANCE DOCUMENT"]
}

def __init__(self, doc_type=None, start_date=None, end_date=None):
self.org_root = "http://www.sec.gov"

if start_date is None:
Expand All @@ -122,17 +126,22 @@ def __init__(self, start_date=None, end_date=None):
else:
self.end_date = end_date

def html_search(self, tree, types):
if doc_type == "html":
self.doc_type = Edgar.doc_types['html']
else:
self.doc_type = Edgar.doc_types['xbrl']

def page_search(self, tree, types):
"""
html_search finds the document url in the document listing html.
page_search finds the document url in the document listing file links.
"""

grab_next = False
tables = list(tree.iter("table"))

try:
for table in tables:
if table.attrib['summary'] == "Document Format Files":
if table.attrib['summary'] == self.doc_type[0]:
for row in table.findall('tr'):
for column in row.findall('td'):
if grab_next:
Expand All @@ -141,7 +150,7 @@ def html_search(self, tree, types):
return link.attrib['href']
break
grab_next = False
if column.text == types:
if column.text in self.doc_type[2]:
grab_next = True
except UnicodeEncodeError:
pass
Expand All @@ -152,10 +161,9 @@ def ingest_stock(self, ticker):
It uses a ticker or keyword.
"""

doc_types = ['10-K', '10-Q']
to_parse = []

for types in doc_types:
for types in self.doc_type[2]:
feed = requests.get(self.org_root+'/cgi-bin/browse-edgar', params={'action': 'getcompany', 'CIK': ticker, 'type': types, 'count': 200, 'output': 'atom'})

# iso-8859-1 -> utf-8
Expand All @@ -173,8 +181,8 @@ def ingest_stock(self, ticker):
output = StringIO.StringIO(requests.get(html_url).text.encode('ascii','ignore'))
tree = etree.parse(output, etree.HTMLParser())

output = self.html_search(tree, types)
if output and ".htm" in output:
output = self.page_search(tree, types)
if output and self.doc_type[1] in output:
to_parse.append(self.org_root+output)

return to_parse
12 changes: 6 additions & 6 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,16 @@ def quit_gracefully(*args):
# note the indexer thread is set to daemon causing it to terminate on a SIGINT
indexer = Indexer(queryer.store_dir, queryer.writer, directoryToWalk)
ingestor = Ingestor()
edgar = Edgar()
#sedar = Sedar()
#edgar = Edgar()
sedar = Sedar()

with open('data.txt', 'r') as reader:
for line in reader:
ingestor.file_downloader(edgar.ingest_stock(line.rstrip()), directoryToWalk)
indexer.indexDocs()
ingestor.file_downloader(sedar.ingest_stock(line.rstrip()), directoryToWalk)
#indexer.indexDocs()

# start up the terminal query interface
queryer.run(queryer.writer, queryer.analyzer)
#queryer.run(queryer.writer, queryer.analyzer)

# if return from Querying then call the signal handler to clean up the writer cleanly
quit_gracefully()
#quit_gracefully()

0 comments on commit 57ee031

Please sign in to comment.