Skip to content

Commit

Permalink
1
Browse files Browse the repository at this point in the history
  • Loading branch information
MrIbrahem committed Dec 11, 2024
1 parent af9ebc3 commit 73d9b31
Show file tree
Hide file tree
Showing 9 changed files with 821 additions and 32 deletions.
7 changes: 7 additions & 0 deletions copy_text/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# find : \s*\n(\s*#\s*[-]+\s*)+\n*^def
# find : \s*\n(#\s*[-]+\s*)+\n*def
# replace : \n\n\ndef


# find : \s*\n(\s*#\s*[-]+\s*)+\n*^(\s*def )
# replace : \n\n\n$2
90 changes: 90 additions & 0 deletions copy_text/files_list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#!/usr/bin/python3
"""
python3 core8/pwb.py copy_text/files_list
"""
import sys
import json
from pathlib import Path
from newapi import printe
from mdapi_sql import sql_for_mdwiki
from apis.cat_cach import Cat_Depth

# Directory containing this script; only used to detect the local Windows checkout.
dir1 = Path(__file__).parent

# Pick the mdtexts root: the "I:" drive when the script lives there,
# otherwise the server location.
if "I:" in str(dir1):
    Dir = Path("I:/mdwiki/mdwiki/public_html/mdtexts")
else:
    Dir = Path("/data/project/mdwiki/public_html/mdtexts")


def fix_title(x):
    """Return *x* with spaces, quotes, colons and slashes replaced by "_"."""
    # One translation pass covers the five characters: space, ', :, / and ".
    table = str.maketrans({ch: "_" for ch in " ':/\""})
    return x.translate(table)


def cats_pages():
    """Collect the pages of every database category, each page listed once.

    Categories are read from ``sql_for_mdwiki.get_db_categories()`` and
    processed with "RTT" last, so a page that also belongs to another
    category is listed under that category rather than under "RTT".

    Returns:
        dict: category name -> list of pages returned by ``Cat_Depth`` that
        were not already claimed by a previously processed category.
    """
    to_cats = {}
    # Pages already assigned to a category; a set keeps the membership test
    # O(1) instead of rescanning a growing list for every page.
    seen = set()

    cats = sql_for_mdwiki.get_db_categories()

    # sorted() is stable and False < True, so every category keeps its
    # relative position except "RTT", which moves to the end.
    ordered = sorted(cats.items(), key=lambda item: item[0] == "RTT")

    for cat, depth in ordered:
        ca = Cat_Depth(cat, depth=depth, ns="all", print_s=False)

        ca_list = [x for x in ca if x not in seen]

        printe.output(f"<<green>> ca_list({cat}): {len(ca_list)}")

        to_cats[cat] = ca_list
        # Mark the pages as claimed only after the whole category is filtered.
        seen.update(ca_list)

    return to_cats


def dump_titles(titles):
    """Merge *titles* into the ``cats_titles.json`` file under ``Dir``.

    The file maps each category to a ``{title: fix_title(title)}`` dict.
    Titles already stored for a category are left untouched; only new
    titles are added before the file is rewritten.

    Args:
        titles (dict): category name -> iterable of page titles.
    """
    file = Dir / "cats_titles.json"

    data = {}
    if file.exists():
        # Start from what is already on disk so earlier entries survive.
        with file.open("r", encoding="utf-8") as f:
            data = json.load(f)
        printe.output(f"<<yellow>> titles in titles.json: {len(data)}")

    for cat, cat_titles in titles.items():
        known = data.get(cat, [])

        new_data = {}
        for title in cat_titles:
            if title not in known:
                new_data[title] = fix_title(title)

        printe.output(f"<<yellow>> cat_titles({cat}) in new_data: {len(new_data)}")

        # Create the category entry on first sight, then merge the new titles.
        data.setdefault(cat, {}).update(new_data)

    with file.open("w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)


def main():
    """Collect the pages of every category and write their titles to disk."""
    pages_by_cat = cats_pages()

    # The mapping is keyed by category, so this reports the number of categories.
    print(f"all_pages: {len(pages_by_cat)}")

    dump_titles(pages_by_cat)


if __name__ == "__main__":
    main()
46 changes: 46 additions & 0 deletions copy_text/scan_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/usr/bin/python3
"""
python3 core8/pwb.py copy_text/scan_files
"""
import sys
import tqdm
import os
from pathlib import Path
from newapi import printe

# Directory containing this script; only used to detect the local Windows checkout.
dir1 = Path(__file__).parent

# Pick the mdtexts root: the "I:" drive when the script lives there,
# otherwise the server location.
if "I:" in str(dir1):
    Dir = Path("I:/mdwiki/mdwiki/public_html/mdtexts")
else:
    Dir = Path("/data/project/mdwiki/public_html/mdtexts")

# Sub-directories of Dir whose *.html files are scanned.
paths = [Dir / sub for sub in ("html", "segments")]

# Files found to contain the Wikimedia error page.
to_del = []

for path in paths:
    files = list(path.glob("*.html"))

    for n, file in tqdm.tqdm(enumerate(files, 1), total=len(files)):
        with open(file, encoding="utf-8") as f:
            text = f.read()

        # A saved error page carries this marker in its markup.
        if ">Wikimedia Error<" in text:
            to_del.append(file)

for n, file in enumerate(to_del, 1):
    printe.output(f"<<red>> f: {n}/{len(to_del)} : Error: {file}")
    # Files are only reported unless "del" is passed on the command line.
    if "del" in sys.argv:
        os.remove(file)
120 changes: 120 additions & 0 deletions copy_text/text_bot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#!/usr/bin/python3
"""
"""
import re
from apis import mdwiki_api
from newapi.super import super_page
from newapi.super import catdepth_new

from copy_to_en.bots import medwiki_account
from copy_to_en.bots import text_changes # text = text_changes.work(text)
from copy_to_en.bots.ref import fix_ref # text = fix_ref(first, alltext)
from mdapi_sql import sql_for_mdwiki

# ---
# Credentials taken from the medwiki account module.
User_tables = dict(
    username=medwiki_account.username,
    password=medwiki_account.password,
)

# Register the same credentials under the "toolforge" key of both newapi modules.
for _api_module in (catdepth_new, super_page):
    _api_module.User_tables["toolforge"] = User_tables

# Short aliases for the newapi entry points.
CatDepth = catdepth_new.subcatquery
MainPage = super_page.MainPage

# Per-title caches filled by get_text_revid() and get_un_wb_tag().
text_cache, revid_cache, un_wb_tag_cache = {}, {}, {}

# Categories known to the database, fetched once at import time.
mdwiki_cats = sql_for_mdwiki.get_db_categories()


def get_cats(alltext, categories=None):
    """Return category links for every known category used in *alltext*.

    A category counts as used when ``[[Category:<name>`` appears in the text
    (case-insensitively) directly followed by ``]]`` or ``|``.

    Args:
        alltext (str): Wikitext to scan.
        categories (iterable of str, optional): Category names to look for.
            Defaults to the module-level ``mdwiki_cats``.

    Returns:
        str: One ``[[Category:<name>]]`` line per matched category, joined
        with newlines, in the iteration order of *categories* with duplicates
        dropped. Empty string when nothing matches.
    """
    if categories is None:
        categories = mdwiki_cats

    found = []
    # dict.fromkeys de-duplicates while keeping a stable order, so the
    # generated text no longer depends on set ordering.
    for category in dict.fromkeys(categories):
        # Category names are literal text: escape them so characters such as
        # "+", "(" or "." neither break the pattern nor match other titles.
        pattern = rf"\[\[Category:{re.escape(str(category))}(\]\]|\|)"
        if re.search(pattern, alltext, re.IGNORECASE):
            found.append(category)

    return "\n".join(f"[[Category:{x}]]" for x in found)


def get_text_revid(x):
    """Fetch the text and revision id of page *x* and remember both.

    Args:
        x (str): Page title passed to ``mdwiki_api.GetPageText``.

    Returns:
        tuple: ``(alltext, revid)`` as returned by the API call.
    """
    alltext, revid = mdwiki_api.GetPageText(x, get_revid=True)

    # Keep the latest fetch result in the module-level caches.
    text_cache[x], revid_cache[x] = alltext, revid

    return alltext, revid


def get_un_wb_tag(alltext, x):
    """Return the first ``{{#unlinkedwikibase:id=Q...}}`` tag found in *alltext*.

    The result is also stored in ``un_wb_tag_cache`` under the key *x*.

    Args:
        alltext (str): Wikitext to search.
        x: Cache key for the result (the page title in ``get_text``).

    Returns:
        str: The matched tag, e.g. ``{{#unlinkedwikibase:id=Q423364}}``, or
        an empty string when the text contains none.
    """
    found = re.search(r"\{\{#unlinkedwikibase:id=Q[0-9]+\}\}", alltext)

    if found is None:
        tag = ""
    else:
        tag = found.group(0)

    un_wb_tag_cache[x] = tag

    return tag


def get_text(x):
    """Build the text to copy for page *x* from its lead section.

    The page is fetched with ``get_text_revid``. Everything before the first
    ``==`` is kept as the lead, and a References section is appended. The
    result is then passed through ``fix_ref`` (together with the full text)
    and ``text_changes.do_text_fixes``. The final text is assembled, one item
    per line, as: the unlinkedwikibase tag, a ``{{mdwiki revid|<revid>}}``
    template, the processed lead followed by the dashboard category, and the
    page's known categories.

    Args:
        x (str): Title of the page to fetch.

    Returns:
        tuple: ``(newtext, revid)``; ``("", "")`` when the page has no text.
    """
    alltext, revid = get_text_revid(x)

    # Nothing to process for a missing or empty page.
    if not alltext:
        print("no text: " + x)
        return "", ""

    page_cats = get_cats(alltext)
    unlinkedwikibase = get_un_wb_tag(alltext, x)

    # Lead section = text before the first "==", plus a References section.
    lead = alltext.split("==")[0].strip() + "\n\n==References==\n<references />"

    body = text_changes.do_text_fixes(fix_ref(lead, alltext))
    body += "\n[[Category:Mdwiki Translation Dashboard articles]]"

    revid_temp = f"{{{{mdwiki revid|{revid}}}}}"

    return "\n".join([unlinkedwikibase, revid_temp, body, page_cats]), revid
Loading

0 comments on commit 73d9b31

Please sign in to comment.