Merge pull request #188 from Mdwiki-TD/update
Update
Showing 10 changed files with 852 additions and 58 deletions.
@@ -0,0 +1,7 @@
# find : \s*\n(\s*#\s*[-]+\s*)+\n*^def
# find : \s*\n(#\s*[-]+\s*)+\n*def
# replace : \n\n\ndef


# find : \s*\n(\s*#\s*[-]+\s*)+\n*^(\s*def )
# replace : \n\n\n$2
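These notes read as editor find-and-replace recipes: they collapse runs of #-dash divider comments that sit immediately before a def into exactly two blank lines, with the last recipe preserving the def's indentation through the captured group. A minimal Python sketch of that last recipe (the use of re.MULTILINE and the helper name are assumptions, not part of the notes):

import re

# Third recipe from the notes above; re.MULTILINE is assumed so that ^ matches at line starts.
DIVIDER_RE = re.compile(r"\s*\n(\s*#\s*[-]+\s*)+\n*^(\s*def )", re.MULTILINE)


def collapse_dividers(source: str) -> str:
    # The editor's $2 becomes \2 in Python's replacement syntax.
    return DIVIDER_RE.sub(r"\n\n\n\2", source)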
@@ -0,0 +1,90 @@
#!/usr/bin/python3
"""
python3 core8/pwb.py copy_text/files_list
"""
import sys
import json
from pathlib import Path
from newapi import printe
from mdapi_sql import sql_for_mdwiki
from apis.cat_cach import Cat_Depth

dir1 = Path(__file__).parent
Dir = "/data/project/mdwiki/public_html/mdtexts"

if str(dir1).find("I:") != -1:
    Dir = "I:/mdwiki/mdwiki/public_html/mdtexts"

Dir = Path(Dir)


def fix_title(x):
    return x.replace(" ", "_").replace("'", "_").replace(":", "_").replace("/", "_").replace('"', "_")


def cats_pages():
    # ---
    all_pages = []
    to_cats = {}
    # ---
    cats = sql_for_mdwiki.get_db_categories()
    # ---
    # sort cats making RTT the last item
    cats = dict(sorted(cats.items(), key=lambda x: x[0] == "RTT"))
    # ---
    for cat, depth in cats.items():
        # ---
        ca = Cat_Depth(cat, depth=depth, ns="all", print_s=False)
        # ---
        ca_list = [x for x in ca if x not in all_pages]
        # ---
        printe.output(f"<<green>> ca_list({cat}): {len(ca_list)}")
        # ---
        to_cats[cat] = ca_list
        # ---
        all_pages.extend(ca_list)
    # ---
    return to_cats


def dump_titles(titles):
    # ---
    file = Dir / "cats_titles.json"
    # ---
    data = {}
    # ---
    if file.exists():
        # read data
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)
        printe.output(f"<<yellow>> titles in titles.json: {len(data)}")
    # ---
    for cat, cat_titles in titles.items():
        new_data = {x: fix_title(x) for x in cat_titles if x not in data.get(cat, [])}
        printe.output(f"<<yellow>> cat_titles({cat}) in new_data: {len(new_data)}")
        # ---
        data.setdefault(cat, {})
        # ---
        # merge data
        data[cat].update(new_data)
    # ---
    with open(file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
    # ---
    return


def main():
    # ---
    all_pages = cats_pages()
    # ---
    print(f"all_pages: {len(all_pages)}")
    # ---
    dump_titles(all_pages)


if __name__ == "__main__":
    main()
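The script above persists its result in cats_titles.json as a nested mapping, category -> {original title -> sanitized title from fix_title}. A small sketch of consuming that file, assuming the Toolforge path used above (the counting loop is illustrative only):

import json
from pathlib import Path

# Same location the script derives from Dir / "cats_titles.json".
file = Path("/data/project/mdwiki/public_html/mdtexts/cats_titles.json")

with open(file, "r", encoding="utf-8") as f:
    data = json.load(f)

# One entry per mdwiki category; each maps a page title to its
# filesystem-safe form (spaces, quotes, colons and slashes replaced by "_").
for cat, titles in data.items():
    print(f"{cat}: {len(titles)} titles")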
@@ -0,0 +1,46 @@
#!/usr/bin/python3
"""
python3 core8/pwb.py copy_text/scan_files
"""
import sys
import tqdm
import os
from pathlib import Path
from newapi import printe

dir1 = Path(__file__).parent
Dir = "/data/project/mdwiki/public_html/mdtexts"

if str(dir1).find("I:") != -1:
    Dir = "I:/mdwiki/mdwiki/public_html/mdtexts"

Dir = Path(Dir)

paths = [
    Dir / "html",
    Dir / "segments",
]

to_del = []

for path in paths:
    files = list(path.glob("*.html"))

    for n, file in tqdm.tqdm(enumerate(files, 1), total=len(files)):
        # printe.output(f"<<yellow>> f: {n}/{len(files)} : {file}")

        with open(file, "r", encoding="utf-8") as f:
            text = f.read()

        if text.find(">Wikimedia Error<") != -1:
            to_del.append(file)

for n, file in enumerate(to_del, 1):
    printe.output(f"<<red>> f: {n}/{len(to_del)} : Error: {file}")
    # del the file
    if "del" in sys.argv:
        os.remove(file)
        continue
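Deletion here is gated on a literal "del" command-line argument, so a plain run of copy_text/scan_files only reports cached pages that captured a "Wikimedia Error" response instead of real content. A minimal sketch of that check factored into a standalone predicate (the helper name is an assumption):

from pathlib import Path


def is_wikimedia_error(file: Path) -> bool:
    # Cached HTML that captured the error page contains this marker.
    with open(file, "r", encoding="utf-8") as f:
        return ">Wikimedia Error<" in f.read()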
@@ -0,0 +1,120 @@
#!/usr/bin/python3
"""
"""
import re
from apis import mdwiki_api
from newapi.super import super_page
from newapi.super import catdepth_new

from copy_to_en.bots import medwiki_account
from copy_to_en.bots import text_changes  # text = text_changes.work(text)
from copy_to_en.bots.ref import fix_ref  # text = fix_ref(first, alltext)
from mdapi_sql import sql_for_mdwiki

# ---
User_tables = {
    "username": medwiki_account.username,
    "password": medwiki_account.password,
}
# ---
catdepth_new.User_tables["toolforge"] = User_tables
super_page.User_tables["toolforge"] = User_tables
# ---
CatDepth = catdepth_new.subcatquery
MainPage = super_page.MainPage

text_cache = {}
revid_cache = {}
un_wb_tag_cache = {}

mdwiki_cats = sql_for_mdwiki.get_db_categories()


def get_cats(alltext):
    # ---
    cats = []
    # ---
    for category in mdwiki_cats:
        # ---
        mat = re.search(rf"\[\[Category:{category}(\]\]|\|)", alltext, re.IGNORECASE)
        # ---
        if mat:
            cats.append(category)
    # ---
    cats = list(set(cats))
    # ---
    # if len(cats) > 1 and "RTT" in cats: cats.remove("RTT")
    # ---
    cats_text = "\n".join([f"[[Category:{x}]]" for x in cats])
    # ---
    return cats_text


def get_text_revid(x):
    alltext, revid = mdwiki_api.GetPageText(x, get_revid=True)
    # ---
    text_cache[x] = alltext
    revid_cache[x] = revid
    # ---
    return alltext, revid


def get_un_wb_tag(alltext, x):
    # search for text like {{#unlinkedwikibase:id=Q423364}}
    pattern = r"\{\{#unlinkedwikibase:id=Q[0-9]+\}\}"
    # ---
    match = re.search(pattern, alltext)
    # ---
    unlinkedwikibase = match.group(0) if match else ""
    # ---
    # matches = re.findall(pattern, alltext)
    # for m in matches:
    #     unlinkedwikibase = m
    #     break
    # ---
    un_wb_tag_cache[x] = unlinkedwikibase
    # ---
    return unlinkedwikibase


def get_text(x):
    """Retrieve and process text from a specified page.
    This function fetches the text content of a page using the
    `mdwiki_api.GetPageText` method. It processes the retrieved text to
    extract and format specific information, including handling unlinked
    Wikibase IDs and adjusting the infobox formatting. The function also
    ensures that references are properly formatted and included in the
    output.
    Args:
        x (str): The identifier of the page from which to retrieve text.
    Returns:
        tuple: A tuple containing the processed text and the revision ID
        of the page.
    """
    alltext, revid = get_text_revid(x)
    # ---
    if not alltext:
        print("no text: " + x)
        return "", ""
    # ---
    page_cats = get_cats(alltext)
    # ---
    unlinkedwikibase = get_un_wb_tag(alltext, x)
    # ---
    first = alltext.split("==")[0].strip()
    # ---
    first = first + "\n\n==References==\n<references />"
    newtext = first
    # ---
    newtext = fix_ref(first, alltext)
    # ---
    newtext = text_changes.do_text_fixes(newtext)
    # ---
    newtext += "\n[[Category:Mdwiki Translation Dashboard articles]]"
    # ---
    revid_temp = f"{{{{mdwiki revid|{revid}}}}}"
    # ---
    newtext = f"{unlinkedwikibase}\n{revid_temp}\n{newtext}\n{page_cats}"
    # ---
    return newtext, revid
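The final f-string in get_text stacks four pieces: the {{#unlinkedwikibase:...}} tag, a {{mdwiki revid|...}} template, the processed lead section (ending in the References block plus the dashboard tracking category), and the [[Category:...]] lines matched by get_cats. A rough sketch of the returned wikitext for a hypothetical page, with invented values:

# Hypothetical shape of get_text("Some page")[0]; every value here is illustrative.
newtext_example = (
    "{{#unlinkedwikibase:id=Q423364}}\n"
    "{{mdwiki revid|123456}}\n"
    "Lead paragraph of the article...\n"
    "\n"
    "==References==\n"
    "<references />\n"
    "[[Category:Mdwiki Translation Dashboard articles]]\n"
    "[[Category:RTT]]"
)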