-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathWebparser.cs
175 lines (153 loc) · 6.19 KB
/
Webparser.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.Drawing;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
namespace 顶点云
{
public class Webparser:NovelsCollectionInfo
{
/// <summary>
/// Creates a parser for the given start page.
/// </summary>
/// <param name="url">Address stored in <c>_defaultUri</c>; <see cref="GetCollection"/> later passes it to <see cref="Htmltest"/>.</param>
public Webparser(string url)
{
// _defaultUri is a static field, so this assignment is shared by every instance:
// constructing a second Webparser overwrites the address seen by the first.
_defaultUri = url;
}
// Start-page address assigned by the constructor; static, so shared by all instances.
static string _defaultUri;
// Root folder for cached files; the "picture" and "lists" subfolders are created beneath it.
// NOTE(review): hard-coded absolute path on drive G: — consider making this configurable.
static string _cachelocal= @"G:\Visualproject\顶点云\Cache";
/*
The two regular expressions below are stopgaps: regexes are hard to get right,
so they are only as precise as they currently need to be.
NOTE(review): neither regex is referenced in the visible part of this file.
*/
// Matches an <a ...> tag and captures its quoted href value in the "href" group.
static readonly Regex reg_href = new Regex("<a([\\s]+|[\\s]+[^<>]+[\\s]+)href=(\"(?<href>[^<>\"']*)\"|'(?<href>[^<>\"']*)')[^<>]*");
// Matches an <img ...> tag (case-insensitively) and captures its src value in the "src" group.
static readonly Regex reg_rec = new Regex(@"(?i)<img[^>]*?\ssrc\s*=\s*(['""]?)(?<src>[^'""\s>]+)\1[^>]*>");
/// <summary>
/// Novel categories. Members use the default underlying values (0, 1, 2, ...) in declaration order.
/// </summary>
public enum Types
{
科幻小说, // science fiction
网游小说, // online-game novels
穿越小说, // time-travel novels
都市小说, // urban novels
修真小说, // cultivation (xiuzhen) novels
玄幻小说, // fantasy (xuanhuan) novels
仙侠小说, // xianxia novels
耽美小说, // danmei novels
言情小说, // romance novels
推理小说, // mystery / detective novels
恐怖灵异, // horror and supernatural
历史小说, // historical novels
名著, // classics
散文, // essays / prose
其他 // other
}
/// <summary>
/// Scrapes the start page and fills the on-disk cache: one image file per entry of
/// <c>_nameInfo</c>, followed by one chapter-list file.
/// </summary>
/// <remarks>
/// Everything runs synchronously on the calling thread.
/// </remarks>
public void GetCollection()
{
    // Fills _nameInfo (image addresses) and _coverInfo from the start page.
    Htmltest(_defaultUri);

    int index = 0;
    foreach (string imageUrl in _nameInfo)
    {
        string imageCache = Path.Combine(_cachelocal, "picture", index + ".cache");
        SetImage(imageUrl, imageCache);
        index += 1;
    }

    // index now equals the number of images saved; it also names the chapter-list cache file.
    string listCache = Path.Combine(_cachelocal, "lists", index + ".cache");
    Set_ListCache("https://www.booktxt.net/5_5552/", listCache);
}
/// <summary>
/// Fetches the chapter list found at <paramref name="url"/> and appends it, one entry
/// per line, to the file <paramref name="cache"/>.
/// </summary>
/// <param name="url">Address of the chapter-list page, handed to <see cref="ListFromLink"/>.</param>
/// <param name="cache">Path of the file the lines are appended to.</param>
private void Set_ListCache(string url, string cache)
{
    // Make sure the "lists" cache folder exists before writing.
    string listsDirectory = Path.Combine(_cachelocal, "lists");
    bool directoryMissing = !Directory.Exists(listsDirectory);
    if (directoryMissing)
    {
        Directory.CreateDirectory(listsDirectory);
    }

    List<string> chapterLines = ListFromLink(url);
    File.AppendAllLines(cache, chapterLines);
}
/// <summary>
/// Downloads the image at <paramref name="url"/> and saves it to <paramref name="cache"/>.
/// </summary>
/// <param name="url">Absolute address of the image; it is parsed with <see cref="Uri"/>.</param>
/// <param name="cache">Path of the file the decoded image is written to.</param>
public static void SetImage(string url,string cache)
{
    // Make sure the "picture" cache folder exists before saving.
    if (!Directory.Exists(Path.Combine(_cachelocal, "picture")))
    {
        Directory.CreateDirectory(Path.Combine(_cachelocal, "picture"));
    }

    Uri address = new Uri(url);
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(address);
    request.KeepAlive = true;
    request.ProtocolVersion = HttpVersion.Version11;
    request.Method = "GET";
    request.Accept = "*/* ";
    request.UserAgent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5";
    // NOTE(review): this sends only the path component as the Referer header — confirm
    // whether the full address (AbsoluteUri) was intended.
    request.Referer = address.AbsolutePath;

    // The response, its stream and the decoded image all hold unmanaged resources
    // (a connection and a GDI+ handle), so each is disposed deterministically.
    // Disposal runs in reverse order, which keeps the stream open for as long as
    // the image that was created from it is alive.
    using (HttpWebResponse htmlResponse = (HttpWebResponse)request.GetResponse())
    using (Stream st = htmlResponse.GetResponseStream())
    using (Image bt = Image.FromStream(st, true))
    {
        // Store the image in the cache file.
        bt.Save(cache);
    }
}
// NOTE(review): public mutable static strings initialised to ""; nothing in the visible
// part of this file reads or writes them — confirm whether external callers rely on them.
public static string _s="";
public static string _st = "";
/// <summary>
/// Loads the page at <paramref name="http"/> and, for every item inside the
/// "hotcontent" block, records the image address in <c>_nameInfo</c> and the link
/// address in <c>_coverInfo</c>. Both lists are replaced on every call and stay
/// index-aligned; they are left empty when the expected markup is not found.
/// </summary>
/// <param name="http">Page address; expected to be a <see cref="string"/> (any other type is passed on as null).</param>
public static void Htmltest(object http)
{
    HtmlWeb web = new HtmlWeb();
    HtmlDocument htmldoc = web.Load(http as string);
    _nameInfo = new List<string>();
    _coverInfo = new List<string>();

    HtmlNode hot_node = htmldoc.DocumentNode.SelectSingleNode("//div[@id='hotcontent']/div[@class='l']");
    if (hot_node == null)
    {
        // Page layout differs from what is expected: nothing to collect.
        return;
    }

    // An XPath that starts with "//" is evaluated from the document root even when it is
    // called on a child node; the leading "." keeps the search inside the current node.
    HtmlNodeCollection _collection = hot_node.SelectNodes(".//div[@class='item']");
    if (_collection == null)
    {
        // SelectNodes yields null (not an empty collection) when nothing matches.
        return;
    }

    foreach (HtmlNode _htmlnode in _collection)
    {
        HtmlNode _Nv = _htmlnode.SelectSingleNode(".//img[@src]");
        HtmlNode _Cv = _htmlnode.SelectSingleNode(".//div[@class='image']/a[@href]");
        if (_Nv == null || _Cv == null)
        {
            // Skip incomplete items so that both lists keep the same length.
            continue;
        }

        // Read the attributes by name; their position inside the tag is not guaranteed.
        _nameInfo.Add(_Nv.Attributes["src"].Value);
        _coverInfo.Add(_Cv.Attributes["href"].Value);
    }
}
/// <summary>
/// Extracts the chapter list from the page at <paramref name="http"/>.
/// </summary>
/// <param name="http">Page address; expected to be a <see cref="string"/> (any other type is passed on as null).</param>
/// <returns>
/// A list whose first element is the <c>ToString()</c> of the loaded document's encoding,
/// followed by one "title|href" entry per chapter link found inside the "list" block.
/// Only the first element is present when the block or its links are missing.
/// </returns>
public static List<string> ListFromLink(object http)
{
    HtmlWeb list_web = new HtmlWeb();
    // The page is decoded as gb2312 regardless of what it declares.
    list_web.OverrideEncoding = Encoding.GetEncoding("gb2312");
    HtmlDocument list_doc = list_web.Load(http as string);

    List<string> l_ss = new List<string>();
    l_ss.Add(list_doc.Encoding.ToString());

    HtmlNode list_node = list_doc.DocumentNode.SelectSingleNode("//div[@id='list']");
    if (list_node == null)
    {
        return l_ss;
    }

    // ".//" restricts the search to the "list" block; a bare "//" would be evaluated
    // from the document root and pick up matching links anywhere on the page.
    HtmlNodeCollection list_collec = list_node.SelectNodes(".//dd/a[@href]");
    if (list_collec == null)
    {
        // SelectNodes yields null (not an empty collection) when nothing matches.
        return l_ss;
    }

    foreach (HtmlNode item in list_collec)
    {
        l_ss.Add(item.InnerText + "|" + item.Attributes["href"].Value);
    }
    return l_ss;
}
/// <summary>
/// Extracts the chapter text from the page at <paramref name="http"/>.
/// </summary>
/// <param name="http">Page address; expected to be a <see cref="string"/> (any other type is passed on as null).</param>
/// <returns>
/// The inner text of the element whose id is "content", with every occurrence of the
/// separator literal replaced by a line break.
/// </returns>
public static string ContentFromLink(object http)
{
    // The page is decoded as gb2312 regardless of what it declares.
    HtmlWeb chapterWeb = new HtmlWeb
    {
        OverrideEncoding = Encoding.GetEncoding("gb2312")
    };

    HtmlDocument chapterDoc = chapterWeb.Load(http as string);
    HtmlNode contentDiv = chapterDoc.DocumentNode.SelectSingleNode("//div[@id='content']");

    string rawText = contentDiv.InnerText;
    string chapterText = rawText.Replace(" ", "\n");
    return chapterText;
}
}
}