From 73d9b31cb8c2dbb7fe44d4744e113d4acfab8986 Mon Sep 17 00:00:00 2001 From: ibrahem Date: Wed, 11 Dec 2024 22:44:58 +0300 Subject: [PATCH] 1 --- copy_text/__init__.py | 7 ++ copy_text/files_list.py | 90 ++++++++++++++ copy_text/scan_files.py | 46 +++++++ copy_text/text_bot.py | 120 ++++++++++++++++++ copy_text/to_file2.py | 221 +++++++++++++++++++++++++++++++++ copy_to_en/to_file.py | 101 +++++++++++++++ copy_to_en/to_file2.py | 203 ++++++++++++++++++++++++++++++ md_core_helps/apis/cat_cach.py | 4 +- wprefs/wpref_text.py | 61 +++++---- 9 files changed, 821 insertions(+), 32 deletions(-) create mode 100644 copy_text/__init__.py create mode 100644 copy_text/files_list.py create mode 100644 copy_text/scan_files.py create mode 100644 copy_text/text_bot.py create mode 100644 copy_text/to_file2.py create mode 100644 copy_to_en/to_file.py create mode 100644 copy_to_en/to_file2.py diff --git a/copy_text/__init__.py b/copy_text/__init__.py new file mode 100644 index 00000000..a16d046a --- /dev/null +++ b/copy_text/__init__.py @@ -0,0 +1,7 @@ +# find : \s*\n(\s*#\s*[-]+\s*)+\n*^def +# find : \s*\n(#\s*[-]+\s*)+\n*def +# replace : \n\n\ndef + + +# find : \s*\n(\s*#\s*[-]+\s*)+\n*^(\s*def ) +# replace : \n\n\n$2 diff --git a/copy_text/files_list.py b/copy_text/files_list.py new file mode 100644 index 00000000..262087ae --- /dev/null +++ b/copy_text/files_list.py @@ -0,0 +1,90 @@ +#!/usr/bin/python3 +""" + +python3 core8/pwb.py copy_text/files_list + + +""" +import sys +import json +from pathlib import Path +from newapi import printe +from mdapi_sql import sql_for_mdwiki +from apis.cat_cach import Cat_Depth + +dir1 = Path(__file__).parent +Dir = "/data/project/mdwiki/public_html/mdtexts" + +if str(dir1).find("I:") != -1: + Dir = "I:/mdwiki/mdwiki/public_html/mdtexts" + +Dir = Path(Dir) + + +def fix_title(x): + return x.replace(" ", "_").replace("'", "_").replace(":", "_").replace("/", "_").replace('"', "_") + + +def cats_pages(): + # --- + all_pages = [] + to_cats = {} + # --- + cats = sql_for_mdwiki.get_db_categories() + # --- + # sort cats making RTT the last item + cats = dict(sorted(cats.items(), key=lambda x: x[0] == "RTT")) + # --- + for cat, depth in cats.items(): + # --- + ca = Cat_Depth(cat, depth=depth, ns="all", print_s=False) + # --- + ca_list = [x for x in ca if x not in all_pages] + # --- + printe.output(f"<> ca_list({cat}): {len(ca_list)}") + # --- + to_cats[cat] = ca_list + # --- + all_pages.extend(ca_list) + # --- + return to_cats + + +def dump_titles(titles): + # --- + file = Dir / "cats_titles.json" + # --- + data = {} + # --- + if file.exists(): + # read data + with open(file, "r", encoding="utf-8") as f: + data = json.load(f) + printe.output(f"<> titles in titles.json: {len(data)}") + # --- + for cat, cat_titles in titles.items(): + new_data = {x: fix_title(x) for x in cat_titles if x not in data.get(cat, [])} + printe.output(f"<> cat_titles({cat}) in new_data: {len(new_data)}") + # --- + data.setdefault(cat, {}) + # --- + # merge data + data[cat].update(new_data) + # --- + with open(file, "w", encoding="utf-8") as f: + json.dump(data, f, indent=4, ensure_ascii=False) + # --- + return + + +def main(): + # --- + all_pages = cats_pages() + # --- + print(f"all_pages: {len(all_pages)}") + # --- + dump_titles(all_pages) + + +if __name__ == "__main__": + main() diff --git a/copy_text/scan_files.py b/copy_text/scan_files.py new file mode 100644 index 00000000..271b7b76 --- /dev/null +++ b/copy_text/scan_files.py @@ -0,0 +1,46 @@ +#!/usr/bin/python3 +""" + +python3 core8/pwb.py 
copy_text/scan_files + + +""" +import sys +import tqdm +import os +from pathlib import Path +from newapi import printe + +dir1 = Path(__file__).parent +Dir = "/data/project/mdwiki/public_html/mdtexts" + +if str(dir1).find("I:") != -1: + Dir = "I:/mdwiki/mdwiki/public_html/mdtexts" + +Dir = Path(Dir) + +paths = [ + Dir / "html", + Dir / "segments", +] + +to_del = [] + +for path in paths: + files = list(path.glob("*.html")) + + for n, file in tqdm.tqdm(enumerate(files, 1), total=len(files)): + # printe.output(f"<> f: {n}/{len(files)} : {file}") + + with open(file, "r", encoding="utf-8") as f: + text = f.read() + + if text.find(">Wikimedia Error<") != -1: + to_del.append(file) + +for n, file in enumerate(to_del, 1): + printe.output(f"<> f: {n}/{len(to_del)} : Error: {file}") + # del the file + if "del" in sys.argv: + os.remove(file) + continue diff --git a/copy_text/text_bot.py b/copy_text/text_bot.py new file mode 100644 index 00000000..6f5f22ea --- /dev/null +++ b/copy_text/text_bot.py @@ -0,0 +1,120 @@ +#!/usr/bin/python3 +""" + +""" +import re +from apis import mdwiki_api +from newapi.super import super_page +from newapi.super import catdepth_new + +from copy_to_en.bots import medwiki_account +from copy_to_en.bots import text_changes # text = text_changes.work(text) +from copy_to_en.bots.ref import fix_ref # text = fix_ref(first, alltext) +from mdapi_sql import sql_for_mdwiki + +# --- +User_tables = { + "username": medwiki_account.username, + "password": medwiki_account.password, +} +# --- +catdepth_new.User_tables["toolforge"] = User_tables +super_page.User_tables["toolforge"] = User_tables +# --- +CatDepth = catdepth_new.subcatquery +MainPage = super_page.MainPage + +text_cache = {} +revid_cache = {} +un_wb_tag_cache = {} + +mdwiki_cats = sql_for_mdwiki.get_db_categories() + + +def get_cats(alltext): + # --- + cats = [] + # --- + for category in mdwiki_cats: + # --- + mat = re.search(rf"\[\[Category:{category}(\]\]|\|)", alltext, re.IGNORECASE) + # --- + if mat: + cats.append(category) + # --- + cats = list(set(cats)) + # --- + # if len(cats) > 1 and "RTT" in cats: cats.remove("RTT") + # --- + cats_text = "\n".join([f"[[Category:{x}]]" for x in cats]) + # --- + return cats_text + + +def get_text_revid(x): + alltext, revid = mdwiki_api.GetPageText(x, get_revid=True) + # --- + text_cache[x] = alltext + revid_cache[x] = revid + # --- + return alltext, revid + + +def get_un_wb_tag(alltext, x): + # search for text like {{#unlinkedwikibase:id=Q423364}} + pattern = r"\{\{#unlinkedwikibase:id=Q[0-9]+\}\}" + # --- + match = re.search(pattern, alltext) + # --- + unlinkedwikibase = match.group(0) if match else "" + # --- + # matches = re.findall(pattern, alltext) + # for m in matches: + # unlinkedwikibase = m + # break + # --- + un_wb_tag_cache[x] = unlinkedwikibase + # --- + return unlinkedwikibase + + +def get_text(x): + """Retrieve and process text from a specified page. + This function fetches the text content of a page using the + `mdwiki_api.GetPageText` method. It processes the retrieved text to + extract and format specific information, including handling unlinked + Wikibase IDs and adjusting the infobox formatting. The function also + ensures that references are properly formatted and included in the + output. + Args: + x (str): The identifier of the page from which to retrieve text. + Returns: + tuple: A tuple containing the processed text and the revision ID + of the page. 
+ """ + alltext, revid = get_text_revid(x) + # --- + if not alltext: + print("no text: " + x) + return "", "" + # --- + page_cats = get_cats(alltext) + # --- + unlinkedwikibase = get_un_wb_tag(alltext, x) + # --- + first = alltext.split("==")[0].strip() + # --- + first = first + "\n\n==References==\n" + newtext = first + # --- + newtext = fix_ref(first, alltext) + # --- + newtext = text_changes.do_text_fixes(newtext) + # --- + newtext += "\n[[Category:Mdwiki Translation Dashboard articles]]" + # --- + revid_temp = f"{{{{mdwiki revid|{revid}}}}}" + # --- + newtext = f"{unlinkedwikibase}\n{revid_temp}\n{newtext}\n{page_cats}" + # --- + return newtext, revid diff --git a/copy_text/to_file2.py b/copy_text/to_file2.py new file mode 100644 index 00000000..f5397af2 --- /dev/null +++ b/copy_text/to_file2.py @@ -0,0 +1,221 @@ +#!/usr/bin/python3 +""" + +python3 core8/pwb.py copy_text/to_file2 + +tfj run tofiles --image python3.9 --command "$HOME/local/bin/python3 core8/pwb.py copy_text/to_file2" + +""" +import sys +import random +import json +import requests +from pathlib import Path +from multiprocessing import Pool + +from newapi import printe +from apis import cat_cach +from copy_text.text_bot import get_text + +dir1 = Path(__file__).parent +Dir = "/data/project/mdwiki/public_html/mdtexts" + +if str(dir1).find("I:") != -1: + Dir = "I:/mdwiki/mdwiki/public_html/mdtexts" + +Dir = Path(Dir) + +done_pages = {1: 0} +len_of_all_pages = {1: 0} + + +def fix_title(x): + return x.replace(" ", "_").replace("'", "_").replace(":", "_").replace("/", "_").replace('"', "_") + + +class WikiProcessor: + def __init__(self, title): + self.base_dir = Dir + self.title = title + self.sanitized_name = fix_title(self.title) + + done_pages[1] += 1 + + printe.output(f"p:{done_pages[1]}/{len_of_all_pages[1]} sanitized_name: {self.sanitized_name}") + + def html_to_segments(self, text): + url = "https://ncc2c.toolforge.org/textp" + headers = {"Content-Type": "application/json"} + payload = {"html": text} + try: + response = requests.post(url, headers=headers, json=payload) + response_data = response.json() + result = response_data.get("result", None) + + if result and result.find(">Wikimedia Error<") != -1: + return None + + return result + except requests.exceptions.RequestException as e: + printe.output(f"html_to_segments(): Error occurred: {e}") + return None + + def convert_wikitext_to_html(self, text): + end_point = "https://en.wikipedia.org/api/rest_v1/transform/wikitext/to/html/Sandbox" + params = {"wikitext": text} + headers = {"Content-Type": "application/json"} + try: + response = requests.post(end_point, headers=headers, data=json.dumps(params)) + + html_text = response.text + + if html_text and html_text.find(">Wikimedia Error<") != -1: + return None + + return html_text + except requests.exceptions.RequestException as e: + printe.output(f"convert_wikitext_to_html(): Error occurred: {e}") + return None + + def save_text(self, text, file_path): + try: + with open(file_path, "w", encoding="utf-8") as f: + f.write(text) + except Exception as e: + printe.output(f"save_text(): Exception: {e}") + + def get_page_text(self, page_name): + newtext, revid = get_text(page_name) + + if not newtext: + return None + + file_path = self.base_dir / f"wikitext/{self.sanitized_name}.txt" + self.save_text(newtext, file_path) + + printe.output("<> get_page_text True.") + + return newtext + + def to_html(self, wikitext): + html = self.convert_wikitext_to_html(wikitext) + + if not html: + return None + + file_path = self.base_dir / 
f"html/{self.sanitized_name}.html" + self.save_text(html, file_path) + + printe.output("<> to_html True.") + return html + + def to_segments(self, html_text): + segments = self.html_to_segments(html_text) + + if not segments: + return None + + file_path = self.base_dir / f"segments/{self.sanitized_name}.html" + self.save_text(segments, file_path) + + printe.output("<> to_segments True.") + + return segments + + def run(self): + wikitext = self.get_page_text(self.title) + + if not wikitext: + printe.output("wikitext is empty..") + return + + html_text = self.to_html(wikitext) + + if html_text: + segments = self.to_segments(html_text) + + +def one_page_new(title): + bot = WikiProcessor(title) + bot.run() + + +def get_all(): + file = Dir / "all_pages.json" + # ---- + if file.exists() and "nodone" not in sys.argv: + return json.loads(file.read_text()) + # ---- + all_pages = cat_cach.make_cash_to_cats(return_all_pages=True, print_s=False) + # --- + with open(file, "w", encoding="utf-8") as f: + f.write(json.dumps(all_pages)) + # --- + return all_pages + + +def start(all_pages): + # --- + if "slash" in sys.argv: + all_pages = [x for x in all_pages if x.find("/") != -1] + # --- + len_of_all_pages[1] = len(all_pages) + # --- + # sort all_pages randmly + random.shuffle(all_pages) + # --- + if "multi" in sys.argv: + pool = Pool(processes=2) + pool.map(one_page_new, all_pages) + pool.close() + pool.terminate() + return + # --- + for n, x in enumerate(all_pages): + print(f"{n}/{len(all_pages)} : {x}") + # --- + one_page_new(x) + + +def get_done(all_pages): + # --- + all_pages = [x.replace(" ", "_") for x in all_pages] + # --- + dir_to_fetch = Dir / "segments" + # --- + done = list(dir_to_fetch.glob("*.html")) + # --- + done = [str(x.name).replace(".html", "") for x in done] + # --- + not_done = [x for x in all_pages if x not in done] + # --- + for x in not_done: + x2 = fix_title(x) + # --- + if x2 in done: + done.append(x) + # --- + return done + + +def main(): + # --- + all_pages = get_all() + # --- + print(f"all_pages: {len(all_pages)}") + # --- + if "nodone" not in sys.argv: + done = get_done(all_pages) + # --- + print(f" done: {len(done)}. 
add 'nodone' to sys.argv to skip find done pages.") + # --- + all_pages = [x for x in all_pages if x not in done] + # --- + start(all_pages) + + +if __name__ == "__main__": + if "test" in sys.argv: + one_page_new("Menopause") + else: + main() diff --git a/copy_to_en/to_file.py b/copy_to_en/to_file.py new file mode 100644 index 00000000..28321915 --- /dev/null +++ b/copy_to_en/to_file.py @@ -0,0 +1,101 @@ +#!/usr/bin/python3 +""" + +python3 core8/pwb.py copy_to_en/to_file + +""" +import json +import requests +from newapi import printe +from pathlib import Path +from copy_to_en.medwiki import get_text + +dir1 = Path(__file__).parent +Dir = "/data/project/mdwiki/public_html/mdtexts" + +if str(dir1).find("I:") != -1: + Dir = "I:/mdwiki/mdwiki/public_html/mdtexts" + + +def html_to_segements(text): + url = "https://ncc2c.toolforge.org/textp" + headers = {"Content-Type": "application/json"} + payload = {"html": text} + try: + response = requests.post(url, headers=headers, json=payload) + response_data = response.json() + result = response_data.get("result", None) + return result + except requests.exceptions.RequestException as e: + printe.error(f"html_to_segements(): Error occurred: {e}") + return None + + +def convert_wikitext_to_html(text): + end_point = "https://en.wikipedia.org/api/rest_v1/transform/wikitext/to/html/Sandbox" + + params = {"wikitext": text} + + headers = {"Content-Type": "application/json"} + + try: + response = requests.post(end_point, headers=headers, data=json.dumps(params)) + # response.raise_for_status() # Raise an error for HTTP errors + return response.text + except requests.exceptions.RequestException as e: + printe.error(f"convert_wikitext_to_html(): Error occurred: {e}") + return None + + +def save_text(text, file): + try: + with open(file, "w", encoding="utf-8") as f: + f.write(text) + except Exception as e: + printe.error(f"save_text(): Exception: {e}") + + +def get_page_text(x, x2): + newtext, revid = get_text(x) + # --- + file = Dir + "/wikitext/" + x2 + ".txt" + # --- + save_text(newtext, file) + # --- + return newtext + + +def to_html(newtext, x, x2): + # --- + html = convert_wikitext_to_html(newtext) + # --- + file = Dir + "/html/" + x2 + ".html" + # --- + save_text(html, file) + # --- + return html + + +def to_segements(html_text, x2): + # --- + segements = html_to_segements(html_text) + # --- + file = Dir + "/segments/" + x2 + ".html" + # --- + save_text(segements, file) + # --- + return segements + + +def one_page_new(x): + x2 = x.replace(" ", "_").replace("'", "_").replace(":", "_").replace("/", "_").replace('"', "_") + # --- + text = get_page_text(x, x2) + # --- + html_text = to_html(text, x, x2) + # --- + seg_text = to_segements(html_text, x2) + + +if __name__ == "__main__": + one_page_new("Menopause") diff --git a/copy_to_en/to_file2.py b/copy_to_en/to_file2.py new file mode 100644 index 00000000..dd8aeefe --- /dev/null +++ b/copy_to_en/to_file2.py @@ -0,0 +1,203 @@ +#!/usr/bin/python3 +""" + +python3 core8/pwb.py copy_to_en/to_file2 + + +""" +import sys +import random +import json +import requests +from newapi import printe +from pathlib import Path +from multiprocessing import Pool + +from apis import cat_cach +from copy_to_en.medwiki import get_text + +dir1 = Path(__file__).parent +Dir = "/data/project/mdwiki/public_html/mdtexts" + +if str(dir1).find("I:") != -1: + Dir = "I:/mdwiki/mdwiki/public_html/mdtexts" + +Dir = Path(Dir) + + +def fix_title(x): + return x.replace(" ", "_").replace("'", "_").replace(":", "_").replace("/", "_").replace('"', "_") 
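# Illustrative example (not part of this patch): fix_title() maps every character
# that is unsafe in a file name to an underscore, so page titles become stable
# file stems under mdtexts/. For instance:
#   fix_title("Crohn's disease")         -> "Crohn_s_disease"
#   fix_title('ABO "blood" group: A/B')  -> "ABO__blood__group__A_B"
# The same stem is reused for the wikitext/, html/ and segments/ files written by
# WikiProcessor below, and (in copy_text/to_file2.py) when already-processed
# pages are matched against existing files in get_done().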
+ + +class WikiProcessor: + def __init__(self, title): + self.base_dir = Dir + self.title = title + self.sanitized_name = fix_title(self.title) + + def html_to_segments(self, text): + url = "https://ncc2c.toolforge.org/textp" + headers = {"Content-Type": "application/json"} + payload = {"html": text} + try: + response = requests.post(url, headers=headers, json=payload) + response_data = response.json() + result = response_data.get("result", None) + return result + except requests.exceptions.RequestException as e: + printe.output(f"html_to_segments(): Error occurred: {e}") + return None + + def convert_wikitext_to_html(self, text): + end_point = "https://en.wikipedia.org/api/rest_v1/transform/wikitext/to/html/Sandbox" + params = {"wikitext": text} + headers = {"Content-Type": "application/json"} + try: + response = requests.post(end_point, headers=headers, data=json.dumps(params)) + return response.text + except requests.exceptions.RequestException as e: + printe.output(f"convert_wikitext_to_html(): Error occurred: {e}") + return None + + def save_text(self, text, file_path): + try: + with open(file_path, "w", encoding="utf-8") as f: + f.write(text) + except Exception as e: + printe.output(f"save_text(): Exception: {e}") + + def get_page_text(self, page_name): + newtext, revid = get_text(page_name) + + if not newtext: + return None + + file_path = self.base_dir / f"wikitext/{self.sanitized_name}.txt" + self.save_text(newtext, file_path) + + printe.output("<> get_page_text True.") + + return newtext + + def to_html(self, wikitext): + html = self.convert_wikitext_to_html(wikitext) + + if not html: + return None + + file_path = self.base_dir / f"html/{self.sanitized_name}.html" + self.save_text(html, file_path) + + printe.output("<> to_html True.") + return html + + def to_segments(self, html_text): + segments = self.html_to_segments(html_text) + + if not segments: + return None + + file_path = self.base_dir / f"segments/{self.sanitized_name}.html" + self.save_text(segments, file_path) + + printe.output("<> to_segments True.") + + return segments + + def run(self): + wikitext = self.get_page_text(self.title) + + if not wikitext: + printe.output("wikitext is empty..") + return + + html_text = self.to_html(wikitext) + + if html_text: + segments = self.to_segments(html_text) + + +def one_page_new(title): + bot = WikiProcessor(title) + bot.run() + + +def get_all(): + file = Dir / "all_pages.json" + # ---- + if file.exists() and "nodone" not in sys.argv: + return json.loads(file.read_text()) + # ---- + all_pages = cat_cach.make_cash_to_cats(return_all_pages=True, print_s=False) + # --- + with open(file, "w", encoding="utf-8") as f: + f.write(json.dumps(all_pages)) + # --- + return all_pages + + +def start(all_pages): + # --- + if "slash" in sys.argv: + all_pages = [x for x in all_pages if x.find("/") != -1] + # --- + # sort all_pages randmly + random.shuffle(all_pages) + # --- + if "multi" in sys.argv: + pool = Pool(processes=2) + pool.map(one_page_new, all_pages) + pool.close() + pool.terminate() + return + # --- + for n, x in enumerate(all_pages): + print(f"{n}/{len(all_pages)} : {x}") + # --- + one_page_new(x) + + +def get_done(all_pages): + # --- + all_pages = [x.replace(" ", "_") for x in all_pages] + # --- + dir_to_fetch = Dir / "segments" + # --- + files = list(dir_to_fetch.glob("*.html")) + # --- + files = [str(x.name).replace(".html", "") for x in files] + # --- + not_done = [x for x in all_pages if x not in files] + # --- + """ + for x in not_done[:]: + x2 = fix_title(x) + # --- + if x2 
in files: + not_done.remove(x) + """ + # --- + return not_done + + +def main(): + # --- + all_pages = get_all() + # --- + print(f"all_pages: {len(all_pages)}") + # --- + if "nodone" not in sys.argv: + done = get_done(all_pages) + # --- + print(f" done: {len(done)}. add 'nodone' to sys.argv to skip find done pages.") + # --- + all_pages = [x for x in all_pages if x not in done] + # --- + start(all_pages) + + +if __name__ == "__main__": + if "test" in sys.argv: + one_page_new("Menopause") + else: + main() diff --git a/md_core_helps/apis/cat_cach.py b/md_core_helps/apis/cat_cach.py index 8f651f78..9a7049ab 100644 --- a/md_core_helps/apis/cat_cach.py +++ b/md_core_helps/apis/cat_cach.py @@ -1,15 +1,17 @@ #!/usr/bin/python3 """ + python3 core8/pwb.py apis/cat_cach from apis import cat_cach +from apis/cat_cach import Cat_Depth all_pages = cat_cach.make_cash_to_cats(return_all_pages=True) """ import time from datetime import datetime -from pathlib import Path +# from pathlib import Path from mdapi_sql import sql_for_mdwiki from mdpy.bots.check_title import valid_title from newapi import printe diff --git a/wprefs/wpref_text.py b/wprefs/wpref_text.py index 60e7d247..b563cf99 100644 --- a/wprefs/wpref_text.py +++ b/wprefs/wpref_text.py @@ -3,10 +3,6 @@ from wprefs.wpref_text import fix_page # fix_page(text, title) """ -# -# (C) Ibrahem Qasim, 2023 -# -# import re import sys @@ -20,27 +16,27 @@ from wprefs.infobox import Expend_Infobox -def add_lang_en(text, lang=''): +def add_lang_en(text, lang=""): # --- - print_s('add_lang_en:') + print_s("add_lang_en:") # --- # Match references - REFS = re.compile(r'(?is)(?P\/]*>)(?P.*?<\/ref>)') + REFS = re.compile(r"(?is)(?P\/]*>)(?P.*?<\/ref>)") # --- for Match in REFS.finditer(text): - pap = Match.group('pap') - ref = Match.group('ref') + pap = Match.group("pap") + ref = Match.group("ref") # --- if not ref.strip(): continue # --- - if re.sub(r'\|\s*language\s*\=\s*\w+', '', ref) != ref: + if re.sub(r"\|\s*language\s*\=\s*\w+", "", ref) != ref: continue # --- - ref2 = re.sub(r'(\|\s*language\s*\=\s*)(\|\}\})', r'\g<1>en\g<2>', ref) + ref2 = re.sub(r"(\|\s*language\s*\=\s*)(\|\}\})", r"\g<1>en\g<2>", ref) # --- if ref2 == ref: - ref2 = ref.replace('}}', '|language=en}}') + ref2 = ref.replace("}}", "|language=en}}") # --- if ref2 != ref: text = text.replace(pap + ref, pap + ref2) @@ -50,62 +46,65 @@ def add_lang_en(text, lang=''): def remove_False_code(text): # --- - if 'newcite' not in sys.argv: + if "newcite" not in sys.argv: return text # --- # }}Smith, N; Lopez, RA; Silberman, M (January 2020). "Distributive Shock". [[PMID (identifier)|PMID]] [//pubmed.ncbi.nlm.nih.gov/29261964 29261964]. 
{{[[Template:cite journal|cite journal]]}}: Cite journal requires |journal= ([[Help:CS1 errors#missing_periodical|help]]) # --- # Match references - REFS = re.compile(r'(?is)(?P\/]*>)(?P.*?<\/ref>)') + REFS = re.compile(r"(?is)(?P\/]*>)(?P.*?<\/ref>)") # --- for Match in REFS.finditer(text): - pap = Match.group('pap') - ref = Match.group('ref') + pap = Match.group("pap") + ref = Match.group("ref") if not ref.strip(): # pywikibot.output( "\tno ref" ) continue # --- # find html code like and all span code after it # find and get html code like - if not re.search(r'(?is)]*>', ref): + if not re.search(r"(?is)]*>", ref): # pywikibot.output( "\tno cite" ) continue # --- # find and get html code like ref2 = ref - if ref.find(']*>.*?<\/span>+?)', '', ref2) - ref2 = re.sub(r']*>.*?<\/cite>', '', ref2) + if ref.find("]*>.*?<\/span>+?)", "", ref2) + ref2 = re.sub(r"]*>.*?<\/cite>", "", ref2) # --- - if ref2 != ref and ref2.strip() != '': + if ref2 != ref and ref2.strip() != "": text = text.replace(pap + ref, pap + ref2) # --- return text -def move_dots_text(newtext, lang=''): +def move_dots_text(newtext, lang=""): # --- - dot = r'(\.|\,)' + dot = r"(\.|\,)" # --- - if lang == 'zh': - dot = '(。)' + if lang == "zh": + dot = "(。)" # --- - regline = r'((?:\s*)+)' + regline = r"((?:\s*)+)" # --- - newtext = re.sub(f"{dot}\\s*{regline}", r'\g<2>\g<1>', newtext) + newtext = re.sub(f"{dot}\\s*{regline}", r"\g<2>\g<1>", newtext) # --- return newtext -def fix_page(newtext, title, move_dots=False, infobox=False, section_0='', lang='', add_en_lang=False): +def fix_page(newtext, title, move_dots=False, infobox=False, section_0="", lang="", add_en_lang=False): # --- - print_s(f'fix page: {title}, move_dots:{move_dots}, expend_infobox:{infobox}') + print_s(f"fix page: {title}, move_dots:{move_dots}, expend_infobox:{infobox}") # --- if infobox: newtext = Expend_Infobox(newtext, title, section_0) # --- newtext = remove_False_code(newtext) # --- + # match and delete any text like {{mdwiki revid|555}} + newtext = re.sub(r"\{\{mdwiki[ _]revid\s*\|\s*(\d+)\s*\}\}", "", newtext, flags=re.IGNORECASE) + # --- newtext = DuplicateReferences(newtext) # --- if move_dots: @@ -114,10 +113,10 @@ def fix_page(newtext, title, move_dots=False, infobox=False, section_0='', lang= if add_en_lang: newtext = add_lang_en(newtext, lang=lang) # --- - if lang == 'pt': + if lang == "pt": newtext = pt_months(newtext) # --- - if lang == 'es': + if lang == "es": newtext = fix_es(newtext, title) # --- return newtext
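The substitution added to fix_page() above strips the {{mdwiki revid|NNN}} marker that copy_text/text_bot.get_text() now prepends to every exported page, presumably so the marker does not end up in published translations. A minimal check of the regex exactly as it appears in the hunk (the sample text is illustrative):

import re

sample = "{{#unlinkedwikibase:id=Q423364}}\n{{mdwiki revid|555}}\nLead section ...\n==References==\n"
cleaned = re.sub(r"\{\{mdwiki[ _]revid\s*\|\s*(\d+)\s*\}\}", "", sample, flags=re.IGNORECASE)

# The revid marker is gone; the unlinkedwikibase tag and the article body are untouched.
assert "mdwiki revid" not in cleaned
assert "{{#unlinkedwikibase:id=Q423364}}" in cleaned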
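In copy_text/files_list.py, cats_pages() relies on a small sorting trick: sorting the category dict with key=lambda x: x[0] == "RTT" puts False before True, so "RTT" is handled last and only receives pages that were not already collected under a more specific category (via the `x not in all_pages` filter). A short illustration with made-up category names and depths:

cats = {"RTT": 2, "Anatomy": 3, "Medications": 3}
ordered = dict(sorted(cats.items(), key=lambda x: x[0] == "RTT"))
print(ordered)  # {'Anatomy': 3, 'Medications': 3, 'RTT': 2}  -- RTT moved to the end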
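Both to_file2.py variants run the same two HTTP hops per page: wikitext is rendered to HTML through the enwiki REST transform endpoint, and the HTML is then segmented by the ncc2c tool. The endpoint URLs, payload shapes, and the ">Wikimedia Error<" marker below are taken from the patch; the standalone helper itself and the timeout value are assumptions, sketched only so the two hops can be exercised outside the bot:

import json
import requests

def wikitext_to_segments(wikitext, timeout=120):
    # wikitext -> HTML via the enwiki REST transform endpoint (as in convert_wikitext_to_html)
    r1 = requests.post(
        "https://en.wikipedia.org/api/rest_v1/transform/wikitext/to/html/Sandbox",
        headers={"Content-Type": "application/json"},
        data=json.dumps({"wikitext": wikitext}),
        timeout=timeout,
    )
    html = r1.text
    if not html or ">Wikimedia Error<" in html:
        return None
    # HTML -> segmented text via the ncc2c tool (as in html_to_segments)
    r2 = requests.post(
        "https://ncc2c.toolforge.org/textp",
        headers={"Content-Type": "application/json"},
        json={"html": html},
        timeout=timeout,
    )
    return r2.json().get("result")

# Example: wikitext_to_segments("'''Menopause''' is the time when menstrual periods stop permanently.")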