import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

/**
 * Searches the HTML contents of a Wikipedia page to grab in-text links (relative links).
 * This will not return any external links, sidebar links or reference links.
 *
 * This will try to follow the niceness policies in Wikipedia's robots.txt.
 * It will also ignore any subsection links (links with "#"), any links with "&"
 * and any links with ":". Finally, it will not record any of Wikipedia's redlinks
 * or any pages related to the Wikimedia Foundation.
 *
 * Links returned will be relative Wikipedia links: "/wiki/title_of_article".
*
* All error handling related to connectivity and bad URLs is done internally.
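 *
 * <p>A minimal usage sketch (the article URL here is only an example):
 * <pre>{@code
 * SimpleWikiLinkParser parser = new SimpleWikiLinkParser("https://en.wikipedia.org/wiki/Compiler");
 * Set<String> links = parser.getLinks(); // relative links such as "/wiki/Programming_language"
 * }</pre>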
* @author Alex Shum
*/
public class SimpleWikiLinkParser {
    private URL startPage;
    private Set<String> links;

    /**
     * Creates a new link parser object. Takes a full URL, not a relative URL.
     * @param startPage Full URL of the page to get links from.
     */
    public SimpleWikiLinkParser(String startPage) {
        try {
            this.startPage = new URL(startPage);
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }

        links = new LinkedHashSet<String>();
        BufferedReader br;
        try {
            br = new BufferedReader(new InputStreamReader(this.startPage.openStream()));
            String line;
            boolean startRead = false;
            while((line = br.readLine()) != null) {
                // Link collection starts at the first paragraph tag, which skips
                // the header and sidebar markup that precedes the article body.
                if(line.contains("<p>")) {
                    startRead = true;
                }
                if(startRead && line.contains("<a href=")) {
                    links.addAll(findAllURLs(line));
                }
            }
            br.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns the set of in-text links located on the startPage.
     * Links will be relative Wikipedia links: "/wiki/title_of_article".
     * @return Set of in-text Wikipedia links.
     */
    public Set<String> getLinks() {
        return links;
    }

    /**
     * Finds all URLs in a line of HTML.
     * Ignores links that are restricted:
     * robots.txt restricted, external, "#", ":", "&" and redlinks.
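     *
     * <p>For example (a hypothetical input line), given
     * {@code <p>See <a href="/wiki/Compiler">compiler</a> and <a href="#History">history</a>.}
     * this returns only {@code "/wiki/Compiler"}; the subsection link is filtered out by "#".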
     * @param html Line of HTML to search for URLs.
     * @return List of URLs found in this line of HTML.
     */
    private List<String> findAllURLs(String html) {
        List<String> urls = new ArrayList<String>();
        int startPos = 0;
        int endPos = 0;
        boolean ignoreLine;
        while(startPos >= 0) {
            startPos = html.indexOf("<a href=", startPos);
            if(startPos >= 0) {
                // Extract the href value between the first pair of quotes.
                startPos = html.indexOf("\"", startPos) + 1;
                endPos = html.indexOf("\"", startPos);
                if(startPos == 0 || endPos < 0) break; // malformed anchor: no quoted URL

                ignoreLine = false;
                for(String s : NOT_ALLOWED) {
                    if(html.substring(startPos, endPos).contains(s)) ignoreLine = true;
                }
                if(!ignoreLine) {
                    urls.add(html.substring(startPos, endPos));
                }
                startPos = endPos + 1; // resume scanning after the closing quote
            }
        }
        return urls;
    }

    // Restricted link patterns: paths disallowed by Wikipedia's robots.txt,
    // crawler traps, external domains, redlinks and the main page.
    @SuppressWarnings("serial")
    private static final List<String> NOT_ALLOWED = new ArrayList<String>() {{
        add("trap"); add("/wiki/Special");
        add("/wiki/Wikipedia:Articles_for_deletion");
        add("/wiki/Wikipedia:Votes_for_deletion");
        add("/wiki/Wikipedia:Pages_for_deletion");
        add("/wiki/Wikipedia:Miscellany_for_deletion");
        add("/wiki/Wikipedia:Miscellaneous_deletion");
        add("/wiki/Wikipedia:Copyright_problems");
        add("/wiki/Wikipedia:Protected_titles");
        add("/wiki/Wikipedia:WikiProject_Spam");
        add("/wiki/MediaWiki:Spam-blacklist");
        add("/wiki/MediaWiki_talk:Spam-blacklist");
        add("/wiki/Portal:Prepared_stories");
        add("/wiki/Wikibooks:Votes_for_deletion");
        add("/wiki/Wikipedia:Requests_for_arbitration");
        add("redlink=1");
        add("/wiki/Main_Page");
        add(".org"); add(".net"); add(".com");
        add("#"); add(":"); add("&");
    }};
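
    /**
     * Small demonstration entry point: parses a single article page and prints
     * the in-text links that were kept. The article URL here is only an example.
     */
    public static void main(String[] args) {
        SimpleWikiLinkParser parser =
                new SimpleWikiLinkParser("https://en.wikipedia.org/wiki/Compiler");
        for(String link : parser.getLinks()) {
            System.out.println(link);
        }
    }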
}