-
Notifications
You must be signed in to change notification settings - Fork 0
/
readabilityAnalyzer.php
163 lines (134 loc) · 5.34 KB
/
readabilityAnalyzer.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
<?php
/**
 * Builds an HTML readability report for the articles of a Wikipedia category.
 *
 * The category members and their intro extracts are fetched through the
 * MediaWiki API with cURL. Every extract is scored as the share of its words
 * that appear in the word list defined by simpleWords.php, and the articles
 * are rendered as an HTML table.
 */
class ReadabilityAnalyzer {
    /** Base URL of the wiki that is queried and linked to. */
    public $host = "https://en.wikipedia.org";
    /** API path including the query parameters shared by every request. */
    public $endpoint = "/w/api.php?action=query&format=json&utf8=1";
    /** Namespace name that prefixes category titles. */
    public $categoryNamespace = "Category";
    /** cURL handle reused for all API requests. */
    public $curl;
    /** Stream that receives cURL's verbose output, or false if it could not be opened. */
    public $curlLog;
    /** Word list loaded from simpleWords.php; null until it is first needed. */
    private $simpleWordList = null;

    /**
     * Creates the shared cURL handle and routes its verbose output to curl.log.
     */
    public function __construct() {
        $this->curl = curl_init();
        curl_setopt($this->curl, CURLOPT_USERAGENT, 'wikipedia-readability/1.0 (https://wasmitnetzen.de/wikipedia-readability; irgend-wikipedia-readability@wasmitnetzen.de)');
        curl_setopt($this->curl, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($this->curl, CURLOPT_VERBOSE, 1);
        $this->curlLog = fopen("curl.log", 'w');
        // only redirect the verbose output when the log file could actually be opened
        if (is_resource($this->curlLog)) {
            curl_setopt($this->curl, CURLOPT_STDERR, $this->curlLog);
        }
    }

    /**
     * Closes the verbose log if it was opened.
     */
    public function __destruct() {
        if (is_resource($this->curlLog)) {
            fclose($this->curlLog);
        }
    }

    /**
     * Builds the HTML result table for one category.
     *
     * Prints a notice (via echo) when the API returned a different number of
     * articles than the category listing contained.
     *
     * @param string $category Category name, with or without the "Category:" prefix.
     * @return string HTML table with one row per article.
     */
    public function buildResults($category) {
        // prepend the namespace unless the name already starts with it; a name that
        // merely contains the word (e.g. "Category theory") still needs the prefix
        $prefix = $this->categoryNamespace . ":";
        if (strpos($category, $prefix) !== 0) {
            $category = $prefix . $category;
        }
        // format name for mediawiki
        $category = str_replace(" ", "_", $category);
        // the API reports titles with spaces, so keep that form for the comparison below
        $categoryTitle = str_replace("_", " ", $category);

        // TODO: check if category exists
        // fetch data for category; a failed request yields no members instead of an error
        $rawCategory = $this->fetchCategory($category);
        $categoryContent = is_string($rawCategory) ? json_decode($rawCategory) : null;
        $categoryMembers = isset($categoryContent->query->categorymembers)
            ? $categoryContent->query->categorymembers
            : array();
        $categoryMembersCount = count($categoryMembers);

        $pageIdList = array();
        foreach ($categoryMembers as $article) {
            $pageIdList[] = $article->pageid;
        }

        // request the articles in batches of 20 (due to API limit)
        $extractArray = array();
        foreach (array_chunk($pageIdList, 20) as $batch) {
            $articleResponse = $this->fetchArticles(implode("|", $batch));
            if (!isset($articleResponse->query->pages)) {
                continue;
            }
            foreach ($articleResponse->query->pages as $key => $value) {
                $extractArray[$key] = $value;
            }
        }

        // check if results and query have the same amount of articles
        if ($categoryMembersCount !== count($extractArray)) {
            echo "Not all articles were returned by the API.<br>";
        }

        // calculate readability
        $extractArray = $this->calculateReadabilities($extractArray);
        // sort by readability using the file-level readabilitySort comparator
        usort($extractArray, 'readabilitySort');

        // build table
        $table = '<table class="table table-bordered table-striped"><thead><tr><th>Article</th><th><a title="Higher is better">Readability Score</a></th><th>Extract</th><th>Further Categories</th></tr></thead><tbody>';
        foreach ($extractArray as $article) {
            $categoryLinks = array();
            $articleCategories = isset($article->categories) ? $article->categories : array();
            foreach ($articleCategories as $categoryObject) {
                // dont include current category
                if (str_replace("_", " ", $categoryObject->title) === $categoryTitle) {
                    continue;
                }
                // link to category, remove namespace for the text
                $label = substr($categoryObject->title, strlen($this->categoryNamespace) + 1);
                // urlencode() leaves no character that is special inside the quoted attribute
                $categoryLinks[] = "<a href='index.php?category=" . urlencode($categoryObject->title) . "'>" . $this->escapeHtml($label) . "</a>";
            }
            $categoriesString = implode(" | ", $categoryLinks);

            $title = isset($article->title) ? $article->title : "";
            $extract = isset($article->extract) ? $article->extract : "";
            $articleUrl = $this->host . "/wiki/" . $this->encodeTitleForUrl($title);
            $table .= "<tr><td><a href='" . $this->escapeHtml($articleUrl) . "'>" . $this->escapeHtml($title) . "</a></td><td>" . $article->readabilityFormatted . "</td><td>" . $this->escapeHtml(substr($extract, 0, 60)) . "...</td><td>" . $categoriesString . "</td></tr>";
        }
        $table .= '</tbody></table>';
        return $table;
    }

    /**
     * Requests up to 50 main-namespace members of a category.
     *
     * @param string $category Full category title including the namespace prefix.
     * @return string|false Raw response body, or false when the request failed.
     */
    public function fetchCategory($category) {
        // encode the title so characters such as & or # cannot break the query string
        curl_setopt($this->curl, CURLOPT_URL, $this->host.$this->endpoint."&list=categorymembers&redirects=1&cmlimit=50&cmnamespace=0&cmtitle=".rawurlencode($category));
        return curl_exec($this->curl);
    }

    /**
     * Fetches intro extracts and visible categories for a batch of pages.
     *
     * @param string $pageIds Page ids separated by "|".
     * @return object|null Decoded extracts response in which every page object carries
     *                     a "categories" array; null when the response could not be decoded.
     */
    public function fetchArticles($pageIds) {
        // fetch extracts
        curl_setopt($this->curl, CURLOPT_URL, $this->host.$this->endpoint."&prop=extracts&redirects=1&explaintext=1&exchars=20000&exintro=1&exlimit=20&pageids=".$pageIds);
        $data = $this->decodeResponse(curl_exec($this->curl));
        // also fetch categories
        curl_setopt($this->curl, CURLOPT_URL, $this->host.$this->endpoint."&prop=categories&redirects=1&clshow=!hidden&cllimit=500&pageids=".$pageIds);
        $data2 = $this->decodeResponse(curl_exec($this->curl));

        if (!isset($data->query->pages)) {
            return $data;
        }
        $categories = isset($data2->query->pages) ? $data2->query->pages : new stdClass();
        // attach categories to corresponding page object; pages without any get an empty list
        foreach ($data->query->pages as $page) {
            $page->categories = array();
            if (!isset($page->pageid)) {
                continue;
            }
            $pageid = $page->pageid;
            if (isset($categories->{$pageid}->categories)) {
                $page->categories = $categories->{$pageid}->categories;
            }
        }
        return $data;
    }

    /**
     * Adds readabilityScore and readabilityFormatted to every page object.
     *
     * @param array $extractArray Page objects, each optionally carrying an "extract" string.
     * @return array The same array; the contained objects are modified in place.
     */
    public function calculateReadabilities($extractArray) {
        foreach ($extractArray as $page) {
            // a page without an extract is scored like an empty text
            $text = isset($page->extract) ? $page->extract : "";
            // get readability for each extract
            $page->readabilityScore = $this->calculateReadabilityOnExtract($text);
            // get nicely formatted number as percentage
            $page->readabilityFormatted = number_format($page->readabilityScore * 100, 2)."%";
        }
        return $extractArray;
    }

    /**
     * Scores the first paragraph of an extract.
     *
     * @param string $extract Plain-text extract; only the text before the first newline is used.
     * @return float Share of the space-separated words that are in the simple word list.
     */
    public function calculateReadabilityOnExtract($extract) {
        // cut off extract after first paragraph
        $extract = explode("\n", $extract)[0];
        $simpleWords = $this->loadSimpleWords();
        // divide extract into words
        $words = explode(" ", $extract);
        // count amount of simple words in extract
        $simpleWordsCount = 0;
        foreach ($words as $word) {
            if (in_array($word, $simpleWords, true)) {
                $simpleWordsCount++;
            }
        }
        // explode() always yields at least one element, so the divisor is never zero
        return floatval($simpleWordsCount) / floatval(count($words));
    }

    /**
     * Returns the simple word list, including simpleWords.php only on first use.
     * The included file is expected to define $simpleWords as an array.
     */
    private function loadSimpleWords() {
        if ($this->simpleWordList === null) {
            // based on https://xkcd.com/1133/
            include("simpleWords.php");
            if (isset($simpleWords) && is_array($simpleWords)) {
                $this->simpleWordList = $simpleWords;
            } else {
                trigger_error("simpleWords.php did not define a \$simpleWords array.", E_USER_WARNING);
                $this->simpleWordList = array();
            }
        }
        return $this->simpleWordList;
    }

    /**
     * Decodes a cURL response body; a failed request (false) becomes null.
     */
    private function decodeResponse($raw) {
        return is_string($raw) ? json_decode($raw) : null;
    }

    /**
     * Escapes text for HTML content and quoted attributes. ENT_SUBSTITUTE keeps
     * byte-truncated UTF-8 from turning the whole string into an empty one.
     */
    private function escapeHtml($text) {
        return htmlspecialchars($text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    }

    /**
     * Percent-encodes a page title for use in a /wiki/ path, keeping "/" and ":" readable.
     */
    private function encodeTitleForUrl($title) {
        $encoded = rawurlencode(str_replace(" ", "_", $title));
        return str_replace(array("%2F", "%3A"), array("/", ":"), $encoded);
    }
}
/**
 * usort() comparator that orders page objects by ascending readabilityScore.
 *
 * Returns an integer rather than a boolean: a boolean can never express
 * "less than", and boolean comparator results are deprecated in PHP 8.
 *
 * @param object $a Object with a numeric readabilityScore property.
 * @param object $b Object with a numeric readabilityScore property.
 * @return int -1 if $a scores lower than $b, 1 if higher, 0 if equal.
 */
function readabilitySort($a, $b)
{
    if ($a->readabilityScore < $b->readabilityScore) {
        return -1;
    }
    if ($a->readabilityScore > $b->readabilityScore) {
        return 1;
    }
    return 0;
}
?>