-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
9 changed files
with
821 additions
and
32 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# find : \s*\n(\s*#\s*[-]+\s*)+\n*^def | ||
# find : \s*\n(#\s*[-]+\s*)+\n*def | ||
# replace : \n\n\ndef | ||
|
||
|
||
# find : \s*\n(\s*#\s*[-]+\s*)+\n*^(\s*def ) | ||
# replace : \n\n\n$2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
#!/usr/bin/python3 | ||
""" | ||
python3 core8/pwb.py copy_text/files_list | ||
""" | ||
import sys | ||
import json | ||
from pathlib import Path | ||
from newapi import printe | ||
from mdapi_sql import sql_for_mdwiki | ||
from apis.cat_cach import Cat_Depth | ||
|
||
# Folder holding this script; inspected only to decide which data root to use.
dir1 = Path(__file__).parent

# Use the I: drive copy when the script itself lives under a path containing
# "I:" (presumably a local Windows checkout - confirm); otherwise use the
# default /data/project location.
Dir = Path(
    "I:/mdwiki/mdwiki/public_html/mdtexts"
    if "I:" in str(dir1)
    else "/data/project/mdwiki/public_html/mdtexts"
)
|
||
|
||
def fix_title(x):
    """Return *x* with every space, quote, colon and slash turned into "_".

    The characters replaced are: space, single quote, colon, forward slash
    and double quote.
    """
    # One translation table replaces all five characters in a single pass.
    unsafe_to_underscore = str.maketrans({char: "_" for char in " ':/\""})
    return x.translate(unsafe_to_underscore)
|
||
|
||
def cats_pages():
    """Map every database category to the pages not already claimed by an earlier one.

    Categories come from ``sql_for_mdwiki.get_db_categories()`` (iterated as
    ``name -> depth`` pairs) and are processed with "RTT" last, so a page that
    belongs to several categories is listed only under the first category
    processed that contains it.

    Returns:
        dict: category name -> list of page titles first seen in that category.
    """
    cats = sql_for_mdwiki.get_db_categories()
    # False sorts before True and sorted() is stable, so this only moves "RTT"
    # to the end and keeps the relative order of every other category.
    cats = dict(sorted(cats.items(), key=lambda item: item[0] == "RTT"))

    to_cats = {}
    # A set keeps the "already assigned" check O(1); the original scanned a
    # growing list for every title. Assumes titles are hashable (strings).
    seen = set()

    for cat, depth in cats.items():
        members = Cat_Depth(cat, depth=depth, ns="all", print_s=False)
        ca_list = [title for title in members if title not in seen]
        printe.output(f"<<green>> ca_list({cat}): {len(ca_list)}")
        to_cats[cat] = ca_list
        # Update only after the whole category is filtered, so repeats inside
        # one category are kept exactly as before.
        seen.update(ca_list)

    return to_cats
|
||
|
||
def dump_titles(titles):
    """Merge *titles* into ``cats_titles.json`` under ``Dir`` and rewrite the file.

    Args:
        titles (dict): category name -> iterable of page titles.

    The JSON file maps each category to a ``{title: fix_title(title)}``
    dictionary. Titles already stored for a category are left untouched; only
    titles missing from the file are added.
    """
    file = Dir / "cats_titles.json"

    data = {}
    if file.exists():
        # Start from what was saved by a previous run.
        with file.open("r", encoding="utf-8") as saved:
            data = json.load(saved)
        printe.output(f"<<yellow>> titles in titles.json: {len(data)}")

    for cat, cat_titles in titles.items():
        # Creates the category entry on first sight and returns the stored dict.
        known = data.setdefault(cat, {})
        new_data = {title: fix_title(title) for title in cat_titles if title not in known}
        printe.output(f"<<yellow>> cat_titles({cat}) in new_data: {len(new_data)}")
        known.update(new_data)

    with file.open("w", encoding="utf-8") as out:
        json.dump(data, out, indent=4, ensure_ascii=False)
|
||
|
||
def main():
    """Collect the per-category page lists and save them with ``dump_titles``."""
    pages_by_cat = cats_pages()
    # cats_pages() returns a mapping keyed by category, so this is the number
    # of categories rather than the number of pages.
    print(f"all_pages: {len(pages_by_cat)}")
    dump_titles(pages_by_cat)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
#!/usr/bin/python3 | ||
""" | ||
python3 core8/pwb.py copy_text/scan_files | ||
""" | ||
import sys | ||
import tqdm | ||
import os | ||
from pathlib import Path | ||
from newapi import printe | ||
|
||
# Location of this script; used only to pick the data root below.
dir1 = Path(__file__).parent

# Default data root, replaced by the I: drive copy when the script path
# contains "I:" (presumably a local Windows checkout - confirm).
_data_root = "/data/project/mdwiki/public_html/mdtexts"
if "I:" in str(dir1):
    _data_root = "I:/mdwiki/mdwiki/public_html/mdtexts"

Dir = Path(_data_root)
|
||
# Sub-directories of Dir whose *.html files are inspected.
paths = [Dir / sub_dir for sub_dir in ("html", "segments")]

# Files found to contain the ">Wikimedia Error<" marker.
to_del = []

for path in paths:
    files = list(path.glob("*.html"))

    for file in tqdm.tqdm(files, total=len(files)):
        if ">Wikimedia Error<" in file.read_text(encoding="utf-8"):
            to_del.append(file)

# Files are only removed when "del" is passed on the command line; otherwise
# the run just reports them.
delete_requested = "del" in sys.argv

for n, file in enumerate(to_del, 1):
    printe.output(f"<<red>> f: {n}/{len(to_del)} : Error: {file}")
    if delete_requested:
        os.remove(file)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
#!/usr/bin/python3 | ||
""" | ||
""" | ||
import re | ||
from apis import mdwiki_api | ||
from newapi.super import super_page | ||
from newapi.super import catdepth_new | ||
|
||
from copy_to_en.bots import medwiki_account | ||
from copy_to_en.bots import text_changes # text = text_changes.work(text) | ||
from copy_to_en.bots.ref import fix_ref # text = fix_ref(first, alltext) | ||
from mdapi_sql import sql_for_mdwiki | ||
|
||
# --- | ||
# Account credentials, registered below with both newapi modules under the
# "toolforge" key. The same dict object is shared by the two modules.
User_tables = dict(
    username=medwiki_account.username,
    password=medwiki_account.password,
)
super_page.User_tables["toolforge"] = User_tables
catdepth_new.User_tables["toolforge"] = User_tables

# Short aliases for the newapi entry points.
MainPage = super_page.MainPage
CatDepth = catdepth_new.subcatquery

# Per-title caches filled by get_text_revid() and get_un_wb_tag().
text_cache, revid_cache, un_wb_tag_cache = {}, {}, {}

# Categories known to the database; loaded once when the module is imported.
mdwiki_cats = sql_for_mdwiki.get_db_categories()
|
||
|
||
def get_cats(alltext, categories=None):
    """Return the ``[[Category:...]]`` lines for the known categories used in *alltext*.

    A category counts as used when *alltext* contains ``[[Category:<name>``
    immediately followed by ``]]`` or ``|`` (case-insensitive).

    Args:
        alltext (str): wikitext to scan.
        categories (iterable of str, optional): category names to look for.
            Defaults to the module-level ``mdwiki_cats``.

    Returns:
        str: one ``[[Category:<name>]]`` line per matched category, joined
        with newlines, in the order the categories were supplied. Empty
        string when nothing matches.
    """
    if categories is None:
        categories = mdwiki_cats

    found = []
    # dict.fromkeys drops repeated names while keeping their first-seen order,
    # so the result is deterministic (a set would order by hash).
    for category in dict.fromkeys(categories):
        # Escape the name: it is literal text, and characters such as "+" or
        # "(" would otherwise be read as regex syntax.
        pattern = rf"\[\[Category:{re.escape(category)}(\]\]|\|)"
        if re.search(pattern, alltext, re.IGNORECASE):
            found.append(category)

    return "\n".join(f"[[Category:{name}]]" for name in found)
|
||
|
||
def get_text_revid(x):
    """Fetch the text and revision id of page *x* and remember both.

    The pair comes from ``mdwiki_api.GetPageText(x, get_revid=True)`` and is
    stored in ``text_cache`` / ``revid_cache`` under the key *x*.

    Returns:
        tuple: ``(alltext, revid)`` as returned by the API call.
    """
    fetched = mdwiki_api.GetPageText(x, get_revid=True)
    alltext, revid = fetched
    text_cache[x], revid_cache[x] = alltext, revid
    return alltext, revid
|
||
|
||
def get_un_wb_tag(alltext, x):
    """Return the first ``{{#unlinkedwikibase:id=Q...}}`` tag found in *alltext*.

    The result (an empty string when no tag is present) is also stored in
    ``un_wb_tag_cache`` under the key *x*.
    """
    # Example of the text being searched for: {{#unlinkedwikibase:id=Q423364}}
    found = re.search(r"\{\{#unlinkedwikibase:id=Q[0-9]+\}\}", alltext)

    if found:
        unlinkedwikibase = found.group(0)
    else:
        unlinkedwikibase = ""

    un_wb_tag_cache[x] = unlinkedwikibase
    return unlinkedwikibase
|
||
|
||
def get_text(x):
    """Build the text to copy for page *x* from its lead section.

    Steps, in order:
      1. Fetch the page text and revision id with ``get_text_revid``.
      2. Collect the known category links (``get_cats``) and the
         ``{{#unlinkedwikibase:...}}`` tag (``get_un_wb_tag``) from the full text.
      3. Keep only the text before the first ``==`` and append a
         ``==References==`` section.
      4. Pass the result through ``fix_ref`` (with the full text) and
         ``text_changes.do_text_fixes``.
      5. Append the dashboard category, then wrap everything as
         tag / ``{{mdwiki revid|...}}`` / text / category links.

    Args:
        x (str): title of the page to fetch.

    Returns:
        tuple: ``(newtext, revid)``; ``("", "")`` when the page has no text.
    """
    alltext, revid = get_text_revid(x)

    if not alltext:
        print("no text: " + x)
        return "", ""

    page_cats = get_cats(alltext)
    unlinkedwikibase = get_un_wb_tag(alltext, x)

    # Everything before the first "==" is treated as the lead section.
    lead = alltext.partition("==")[0].strip()
    lead += "\n\n==References==\n<references />"

    newtext = text_changes.do_text_fixes(fix_ref(lead, alltext))
    newtext += "\n[[Category:Mdwiki Translation Dashboard articles]]"

    revid_temp = f"{{{{mdwiki revid|{revid}}}}}"

    newtext = "\n".join([unlinkedwikibase, revid_temp, newtext, page_cats])
    return newtext, revid
Oops, something went wrong.