-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocess_pdf.py
160 lines (133 loc) · 5.57 KB
/
process_pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import pdfplumber
import re
import os
import yaml
import db_engine
import logging
logging.basicConfig()
logging.getLogger('sqlalchemy.engine').setLevel(logging.WARNING)
column_names = re.compile(r'ID Owner Asset Transaction Date Notification Amount Cap.\nType Date Gains >\n\$200\?')
separator = re.compile(r'\nF S: New\n(?:L: .*\n|D: .*\n|C: .*\n|S O: .*\n)*')
stock_pattern = r'^(?:JT|SP)?(.+?)\s+(S \(partial\)|S|P)\s+'
date_pattern = r'(\d{2}/\d{2}/\d{4})'
amount_pattern = r'\$(\d{1,3}(?:,\d{3})(?:,\d{3})?)'
amount_pattern_asset = r'(\$\d{1,3}(?:,\d{3})(?:,\d{3})?)'
ticker_pattern = r'\((\w+)\)'
def printls(array):
print('[')
for item in array:
print(item)
print(']')
print()
def has_table(page):
text = page.extract_text()
return column_names.search(text)
def get_extract(page):
text = page.extract_text()
result = column_names.search(text)
if result:
# Get the end position of the matched text
start_position = result.end()
# Slice the text from the end position of the matched text to the end of the string
remaining_text = text[start_position:].strip()
return remaining_text
def pages_with_tables(pdf):
return [index for index, page in enumerate(pdf.pages) if has_table(page)]
def get_filings_from_pdf(pdf_path):
with pdfplumber.open(pdf_path) as pdf:
pages = [page for page in pdf.pages if has_table(page)]
all_text = ""
for page in pages:
important_text = get_extract(page)
all_text += important_text + "\n"
cleaned_text = all_text.replace('\x00', '')
return cleaned_text
def parse_string(s, filing):
# Extract stock and type
stock_type_match = re.search(stock_pattern, s)
if not stock_type_match:
return None
stock = stock_type_match.group(1).strip()
stock = re.sub(amount_pattern_asset,'',stock)
type_ = stock_type_match.group(2).strip()
# Extract dates
date_matches = re.findall(date_pattern, s)
if len(date_matches) < 2:
return None
date = date_matches[0]
notification_date = date_matches[1]
# Extract amount
amount_matches = re.findall(amount_pattern, s)
if not amount_matches:
return None
sorted_amounts = sorted(amount_matches, key=lambda x: int(x.replace(',', '')))
# check if purchase is stock or option
if '[OP]' in stock:
security = 'OP'
stock = stock.replace('[OP]', '')
elif '[ST]' in stock:
security = 'ST'
stock = stock.replace('[ST]', '')
elif '[AB]' in stock:
security = 'AB'
stock = stock.replace('[AB]', '')
else:
security = 'ST'
match = re.search(ticker_pattern, stock)
ticker = match.group(1) if match else None
return {'stock': stock, 'ticker': ticker, 'security': security, 'type': type_, 'date': date, 'notification_date': notification_date, 'min_amount': sorted_amounts[0], 'max_amount': sorted_amounts[1], 'filing': filing}
def parse_list1(array):
array = array[:-1]
processed_list = []
for item in array:
lines = item.split('\n')
if ('[ST]' in lines[-1] or '[OP]' in lines[-1] or '[AB]' in lines[-1]) and len(lines) > 1:
last_line = ""
while len(lines) > 1:
last_line += lines.pop()
# Find the position of the first date (assumed to be in the format MM/DD/YYYY)
match = re.search(r' S \(partial\) | P | S ', lines[0])
if match:
first_date_pos = match.start()
lines[0] = lines[0][:first_date_pos] + last_line + ' ' + lines[0][first_date_pos:]
processed_list.append('\n'.join(lines))
return processed_list
def convert_to_dict(array, filing):
return [parse_string(item, filing) for item in array]
def get_pdf_purchases(file):
file_name = os.path.splitext(os.path.basename(file))[0]
text = get_filings_from_pdf(file)
result = re.split(separator, text)
parsed1 = parse_list1(result)
parsed2 = convert_to_dict(parsed1, file_name)
for item in parsed2:
if item is None:
print("item is None")
return parsed2
def list_files(directory):
file_paths = []
for root, _, files in os.walk(directory):
for filename in files:
file_path = os.path.join(root, filename)
file_paths.append(file_path)
return file_paths
def insert_purchases_to_db(purchases, db):
for index, purchase in enumerate(purchases):
if purchase is None:
continue
exists = db.session.query(db_engine.Transaction).filter_by(asset=purchase['stock'], transaction_date=purchase['date'], min_amount=purchase['min_amount'], max_amount=purchase['max_amount'], docid=purchase['filing']).first()
if not exists:
db_purchase = db_engine.Transaction(asset=purchase['stock'], security=purchase['security'], transaction_type=purchase['type'], transaction_date=purchase['date'], notification_date=purchase['notification_date'], min_amount=purchase['min_amount'], max_amount=purchase['max_amount'], docid=purchase['filing'], ticker=purchase['ticker'])
db.session.add(db_purchase)
db.session.commit()
if __name__ == "__main__":
with open('config.yaml', 'r') as file:
config = yaml.safe_load(file)
db = db_engine.DbEngine(config['db'])
pdf_path = f'{config["data"]["dir"]}/pdf'
pdf_files = list_files(pdf_path)
purchases = []
for file in pdf_files:
print(file)
purchases.extend(get_pdf_purchases(file))
insert_purchases_to_db(purchases, db)