#coding:utf-8
# Crawl the daily company announcements of listed companies from Sina Finance
# Just supply a date, e.g. 2017-02-21
import os
import math
import time
import datetime
import requests
import threading
from lxml import etree

# Crawl a single announcement and save it to a text file
def spiderOnePiece(iurl, headers, date_str, filename):
    # strip characters that are not allowed in file names
    invalid = ['*', '\\', '/', ':', '\"', '<', '>', '|', '?']
    for c in invalid:
        if c in filename:
            filename = filename.replace(c, '')
    response = requests.get(iurl, headers=headers).content
    page = etree.HTML(response)
    content = page.xpath('//*[@id="content"]/pre')
    if len(content) == 0:  # announcement body not found
        return
    content = content[0].text
    with open(date_str + os.sep + filename, 'w') as f:
        f.write(content.encode('utf-8'))

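# Usage sketch for spiderOnePiece (added for illustration, not executed here):
# the URL below is a placeholder, not a real announcement address; real links
# and the per-date folder are produced by spiderOnePage further down.
# spiderOnePiece('http://vip.stock.finance.sina.com.cn/<announcement_uri>',
#                headers, '2017-02-21', 'title_type.txt')
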
# Crawl one result page of the announcement list
def spiderOnePage(url, headers, date_str):
    website = 'http://vip.stock.finance.sina.com.cn'
    response = requests.get(url, headers=headers).content
    page = etree.HTML(response)
    trList = page.xpath(r'//*[@id="wrap"]/div[@class="Container"]/table/tbody/tr')
    print len(trList)  # number of table rows on this page (debug output)
    if len(trList) == 1:  # crawling finished: the only row says "sorry, no matching records"
        return 0
    if not os.path.exists(date_str):  # create a folder for this date
        os.mkdir(date_str)
    for item in trList:
        aUrl = item.xpath('th/a[1]')
        title = aUrl[0].text                 # announcement title
        href = aUrl[0].attrib['href']        # announcement URI
        href = website + href                # full announcement URL
        atype = item.xpath('td[1]')[0].text  # announcement type
        spiderOnePiece(href, headers, date_str, title + '_' + atype + '.txt')
    return 1

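# Sketch of how one page request is formed (illustrative date and page index;
# the '#datetime#' placeholder is filled in and the page number appended by
# spiderOneDay below):
# page_url = ('http://vip.stock.finance.sina.com.cn/corp/view/'
#             'vCB_BulletinGather.php?gg_date=2017-02-21&page=1')
# spiderOnePage(page_url, headers, '2017-02-21')  # returns 1 while rows remain, 0 when done
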
# Crawl all announcement pages for a single day
def spiderOneDay(url, headers, date_str, log_path='log'):
    url = url.replace('#datetime#', date_str)  # fill in the date
    flag = 1   # 1 while pages keep loading successfully
    index = 1  # start page
    f = open(log_path + os.sep + date_str + '.txt', 'a')
    while flag:
        t_url = url + str(index)
        try:
            flag = spiderOnePage(t_url, headers, date_str)
        except Exception as e:
            print 'err:', e
            flag = 0
        finally:
            if flag:
                print '%s page_%d load success, continue.' % (date_str, index)
                f.write('%s_page_%d load success.\n' % (date_str, index))
                f.flush()
            else:
                print '%s page_%d load fail, end.' % (date_str, index)
                f.write('%s_page_%d load failed.\n' % (date_str, index))
                f.flush()
        index += 1
    f.close()

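# Usage sketch for spiderOneDay (illustrative date; 'url' is the template
# defined in main() containing the '#datetime#' placeholder, and the log
# folder is assumed to exist, as main() ensures):
# spiderOneDay(url, headers, '2017-02-21', log_path='log')
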
# Crawl the announcements for one group of days (run by one worker thread)
def spiderOneGroupDays(url, headers, date_group, log_path):
    for idate in date_group:
        try:
            spiderOneDay(url, headers, idate, log_path)
            print '%s has loaded successfully.' % idate
        except Exception as e:
            print 'err:', e
            continue

# Get all dates from begin_date (inclusive) to end_date (inclusive)
def getBetweenDay(begin_date, end_date):
    date_list = []
    begin_date = datetime.datetime.strptime(begin_date, '%Y-%m-%d')
    # today's date
    now_date = datetime.datetime.strptime(time.strftime('%Y-%m-%d', time.localtime(time.time())), '%Y-%m-%d')
    end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d')
    # if the given end date is later than today, use today as the end date
    if end_date > now_date:
        end_date = now_date
    while begin_date <= end_date:
        date_str = begin_date.strftime('%Y-%m-%d')
        date_list.append(date_str)
        begin_date += datetime.timedelta(days=1)
    return date_list

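# Example (assuming today's date is later than 2017-02-21):
# >>> getBetweenDay('2017-02-19', '2017-02-21')
# ['2017-02-19', '2017-02-20', '2017-02-21']
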
# Split date_list evenly into threadNum groups; the last group may be smaller
def split_date_list(date_list, threadNum):
    # length=(len(date_list)/threadNum if len(date_list)%threadNum==0 else len(date_list)/threadNum+1)
    length = int(math.ceil(len(date_list) * 1.0 / threadNum))
    return [date_list[m:m + length] for m in range(0, len(date_list), length)]

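# Example: 7 dates split across 4 threads gives groups of size 2, 2, 2, 1:
# >>> split_date_list(['d1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7'], 4)
# [['d1', 'd2'], ['d3', 'd4'], ['d5', 'd6'], ['d7']]
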
def main():
    headers = {
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Host": "vip.stock.finance.sina.com.cn",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Upgrade-Insecure-Requests": "1",
        "Connection": "keep-alive",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"
    }
    url = 'http://vip.stock.finance.sina.com.cn/corp/view/vCB_BulletinGather.php?gg_date=#datetime#&page='
    log_path = 'log'
    if not os.path.exists(log_path):
        os.mkdir(log_path)
    # date_str='2017-02-19'
    # spiderOneDay(url,headers,date_str,log_path)
    begin_date = '2017-01-01'
    end_date = '2017-01-31'
    # all dates from begin_date (inclusive) to end_date (inclusive)
    date_list = getBetweenDay(begin_date, end_date)
    print '%s-%s:%d days.' % (begin_date, end_date, len(date_list))
    cut_date_list = split_date_list(date_list, 4)
    print cut_date_list
    threads = []
    for dgroup in cut_date_list:
        t = threading.Thread(target=spiderOneGroupDays, args=(url, headers, dgroup, log_path,))
        threads.append(t)
    # start the worker threads
    for t in threads:
        t.start()
    # wait for all threads to finish (blocks the main thread)
    for t in threads:
        t.join()
    print 'all load success...'

if __name__ == '__main__':
    main()
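
# To run (assumed environment: Python 2.7 with the requests and lxml packages
# installed): adjust begin_date / end_date in main() and execute
#     python crawlCompanyAnnouncement.py
# Announcements are saved as text files in one folder per date, and crawl logs
# are written to the 'log' folder.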