Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update the constant name and imports #4

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 12 additions & 13 deletions WikiExtractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,11 @@
import argparse
import gc
import sys
import urllib.request, urllib.parse, urllib.error
import urllib.request
import re
import bz2
import os.path
from html.entities import name2codepoint
#import fnmatch
import shutil
import mimetypes
import gzip
Expand All @@ -70,7 +69,7 @@
### PARAMS ####################################################################

# This is obtained from the dump itself
prefix = None
PREFIX = None

##
# Whether to preserve links in output
Expand All @@ -86,12 +85,12 @@
# Recognize only these namespaces
# w: Internal links to the Wikipedia
#
acceptedNamespaces = set(['w'])
ACCEPTED_NAMESPACES= set(['w'])

##
# Drop these elements from article text
#
discardElements = set([
DISCARD_ELEMENTS = set([
'gallery', 'timeline', 'noinclude', 'pre',
'table', 'tr', 'td', 'th', 'caption',
'form', 'input', 'select', 'option', 'textarea',
Expand Down Expand Up @@ -132,7 +131,7 @@
## print(footer, file=out)

def WikiDocumentSentences(out, id, title, tags, text):
url = get_url(id, prefix)
url = get_url(id, PREFIX)
header = '\n{0}:{1}'.format(title, "|||".join(tags))
# Separate header from text with a newline.
text = clean(text)
Expand Down Expand Up @@ -176,7 +175,7 @@ def normalizeTitle(title):
rest = m.group(3)

ns = prefix.capitalize()
if ns in acceptedNamespaces:
if ns in ACCEPTED_NAMESPACES:
# If the prefix designates a known namespace, then it might be
# followed by optional whitespace that should be removed to get
# the canonical page name
Expand Down Expand Up @@ -224,7 +223,7 @@ def fixup(m):

# Match elements to ignore
discard_element_patterns = []
for tag in discardElements:
for tag in DISCARD_ELEMENTS:
pattern = re.compile(r'<\s*%s\b[^>]*>.*?<\s*/\s*%s>' % (tag, tag), re.DOTALL | re.IGNORECASE)
discard_element_patterns.append(pattern)

Expand Down Expand Up @@ -353,7 +352,7 @@ def make_anchor_tag(match):
global keepLinks
link = match.group(1)
colon = link.find(':')
if colon > 0 and link[:colon] not in acceptedNamespaces:
if colon > 0 and link[:colon] not in ACCEPTED_NAMESPACES:
return ''
trail = match.group(3)
anchor = match.group(2)
Expand Down Expand Up @@ -587,7 +586,7 @@ def file_name(self):

def process_data(ftype, input, output_sentences, output_structure, incubator,
vital_titles=None, vital_tags=None):
global prefix
global PREFIX
page = []
id = None
inText = False
Expand Down Expand Up @@ -625,7 +624,7 @@ def process_data(ftype, input, output_sentences, output_structure, incubator,
page.append(line)
elif tag == '/page':
colon = title.find(':')
if (colon < 0 or title[:colon] in acceptedNamespaces) and \
if (colon < 0 or title[:colon] in ACCEPTED_NAMESPACES) and \
not redirect:
if (not vital_titles) or (title in vital_titles):
if((incubator != '') and (lang[1] == incubator) and len(lang) > 2):
Expand All @@ -648,7 +647,7 @@ def process_data(ftype, input, output_sentences, output_structure, incubator,
# discover prefix from the xml dump file
# /mediawiki/siteinfo/base
base = m.group(3)
prefix = base[:base.rfind("/")]
PREFIX = base[:base.rfind("/")]

##def load_vital_titles(vitalfn):
## """Given the filename for the vital titles list (one title per line, with
Expand Down Expand Up @@ -698,7 +697,7 @@ def get_argparser():
return parser

def main():
global keepLinks, keepSections, prefix, acceptedNamespaces
global keepLinks, keepSections, PREFIX, ACCEPTED_NAMESPACES
script_name = os.path.basename(sys.argv[0])

parser = get_argparser()
Expand Down