-
Notifications
You must be signed in to change notification settings - Fork 0
/
email_scraper.py
56 lines (42 loc) · 1.25 KB
/
email_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 9 13:19:35 2018
@author: Michael
"""
from pyspark import SparkContext
import os
import re
import io
import csv
# A line is kept when it has a run of non-whitespace on both sides of an "@".
_EMAIL_PATTERN = re.compile(r"(\S+)@(\S+)")


def _clean_list(x):
    """Return a new list with scrape artefacts removed from each string.

    Non-breaking spaces (U+00A0) are replaced by ordinary spaces and
    newline characters are dropped. The input list is not modified.
    """
    return [item.replace(u'\xa0', u' ').replace(u'\n', '') for item in x]


def _get_email(file):
    """Return the cleaned lines of *file* that contain an e-mail-like token.

    Args:
        file: Path of a text file, read as UTF-8.

    Returns:
        One string per matching line, in file order, passed through
        ``_clean_list``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The context manager closes the handle even if decoding fails midway.
    with io.open(file, mode="r", encoding="utf-8") as f:
        matches = [line for line in f if _EMAIL_PATTERN.search(line)]
    return _clean_list(matches)


def _one_list(lists):
    """Flatten one level of nesting: a list of lists becomes a single list.

    Not called by ``main``; kept as a helper for callers that want all
    matches in one flat list.
    """
    return [item for sub in lists for item in sub]


def main(indir='/Users/Michael/Desktop/test', outfile='emails.csv'):
    """Collect e-mail-bearing lines from every file under *indir* into a CSV.

    Each file found by walking *indir* contributes exactly one CSV row whose
    cells are the matching lines of that file followed by one empty cell.
    A file with no matches therefore still produces a row (a single empty
    cell).

    Args:
        indir: Root directory to walk recursively.
        outfile: Path of the CSV file to create or overwrite.

    Raises:
        OSError: If an input file or the output file cannot be opened.
        UnicodeDecodeError: If an input file is not valid UTF-8.
    """
    total_list = []
    for root, dirs, filenames in os.walk(indir):
        # Sorting in place fixes the traversal order so the CSV row order is
        # reproducible from run to run.
        dirs.sort()
        for name in sorted(filenames):
            # os.walk yields bare file names; join with root so the file is
            # found regardless of the current working directory.
            total_list.append(_get_email(os.path.join(root, name)))

    # newline="" lets the csv module control line endings itself; UTF-8
    # matches the encoding the input files are read with.
    with open(outfile, 'w', newline="", encoding="utf-8") as file:
        cw = csv.writer(file)
        # Every row carries a trailing empty cell.
        cw.writerows(r + [""] for r in total_list)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()