Merge pull request #129 from MrIbrahem/update
Update
MrIbrahem authored Apr 14, 2024
2 parents 970b1c4 + 4c74dba commit 122a27c
Showing 9 changed files with 383 additions and 99 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -38,3 +38,4 @@ mass/radiox/**
 ncc_core/nc_import/long_bots.py
 /md_core/newapix
 /mass/eyerounds/htmls
+/fix_mass/fix_sets/studies_done
22 changes: 22 additions & 0 deletions fix_mass/fix_sets/bots/done.py
@@ -0,0 +1,22 @@
"""
from fix_mass.fix_sets.bots.done import studies_done_append, find_done #find_done(study_id)
"""
from pathlib import Path

Dir = Path(__file__).parent.parent

studies_done_dir = Dir / "studies_done"

def find_done(study_id):
file = studies_done_dir / f"{study_id}.done"
if file.exists():
return True
return False


def studies_done_append(study_id):
file = studies_done_dir / f"{study_id}.done"
if not file.exists():
file.touch()
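
A minimal usage sketch for these helpers (the study id is hypothetical, and the studies_done directory is assumed to already exist, since touch() does not create parent directories):

    from fix_mass.fix_sets.bots.done import find_done, studies_done_append

    study_id = "99187"  # hypothetical id

    if not find_done(study_id):
        # ... process the study here ...
        studies_done_append(study_id)  # creates studies_done/99187.done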
128 changes: 87 additions & 41 deletions fix_mass/fix_sets/bots/get_img_info.py
@@ -21,31 +21,22 @@

 st_dic_infos = Dir / "jsons/studies_files_infos"

-def dump_st(data, s_id):
-    file = st_dic_infos / f"{s_id}_s_id.json"
+def dump_st(data, file):

     with open(file, "w", encoding="utf-8") as f:
         json.dump(data, f, ensure_ascii=False, indent=2)
     printe.output(f"<<green>> write {len(data)} to file: {file}")

-def gt_img_info(title):
+def gt_img_info(titles, id_to_url={}):
     # ---
-    title = [title] if not isinstance(title, list) else title
+    titles = [titles] if not isinstance(titles, list) else titles
     # ---
-    info = {}
-    printe.output(f"one_img_info: {len(title)=}")
+    titles = [x for x in titles if x]
     # ---
-    params = {
-        "action": "query",
-        "titles": "|".join(title),
-        # "prop": "revisions|categories|info|extlinks",
-        "prop": "extlinks",
-        # "clprop": "sortkey|hidden", # categories
-        # "rvprop": "timestamp|content|user|ids", # revisions
-        # "cllimit": "max", # categories
-        "ellimit": "max", # extlinks
-        "formatversion": "2",
-    }
-    data = api_new.post_params(params)
+    info = {}
+    printe.output(f"one_img_info: {len(titles)=}")
     # ---
     _x = {
         "pages": [
@@ -65,47 +56,102 @@ def gt_img_info(title):
         ]
     }
     # ---
-    pages = data.get("query", {}).get("pages", [])
+    params = {
+        "action": "query",
+        # "titles": "|".join(titles),
+        # "prop": "revisions|categories|info|extlinks",
+        "prop": "revisions|extlinks",
+        # "clprop": "sortkey|hidden", # categories
+        "rvprop": "content", # revisions
+        # "cllimit": "max", # categories
+        "ellimit": "max", # extlinks
+        "formatversion": "2",
+    }
     # ---
-    for page in pages:
-        extlinks = page.get("extlinks", [])
-        title = page.get("title")
-        # ---
-        info[title] = {"img_url": "", "case_url": "", "study_url": "", "caseId": "", "studyId": ""}
-        # ---
-        for extlink in extlinks:
-            url = extlink.get("url")
-            ma = re.match("https://radiopaedia.org/cases/(\d+)/studies/(\d+)", url)
-            if url.find("/images/") != -1:
-                info[title]["img_url"] = url
-
-            elif re.match(r"^https://radiopaedia.org/cases/[^\d\/]+$", url):
-                info[title]["case_url"] = url
-
-            elif ma:
-                info[title]["study_url"] = url
-                info[title]["caseId"] = ma.group(1)
-                info[title]["studyId"] = ma.group(2)
+    # work with 40 titles at once
+    for i in range(0, len(titles), 40):
+        group = titles[i : i + 40]
+        params["titles"] = "|".join(group)
+        # ---
+        # print("|".join(group))
+        # ---
+        data = api_new.post_params(params)
+        # ---
+        error = data.get("error", {})
+        if error:
+            printe.output(json.dumps(error, indent=2))
+        # ---
+        pages = data.get("query", {}).get("pages", [])
+        # ---
+        for page in pages:
+            extlinks = page.get("extlinks", [])
+            title = page.get("title")
+            # ---
+            # info[title] = {"img_url": "", "case_url": "", "study_url": "", "caseId": "", "studyId": "", "img_id": ""}
+            info[title] = {"img_url": "", "img_id": ""}
+            # ---
+            for extlink in extlinks:
+                url = extlink.get("url")
+                # ma = re.match("https://radiopaedia.org/cases/(\d+)/studies/(\d+)", url)
+                if url.find("/images/") != -1:
+                    info[title]["img_url"] = url
+
+                # elif re.match(r"^https://radiopaedia.org/cases/[^\d\/]+$", url):
+                #     info[title]["case_url"] = url
+
+                # elif ma:
+                #     info[title]["study_url"] = url
+                #     info[title]["caseId"] = ma.group(1)
+                #     info[title]["studyId"] = ma.group(2)
+            # ---
+            revisions = page.get("revisions")
+            if info[title]["img_url"]:
+                continue
+            # ---
+            if not revisions:
+                continue
+            # ---
+            revisions = revisions[0]["content"]
+            # match * Image ID: 58331091 in revisions.split("\n")
+            ma = re.search(r"Image ID: (\d+)", revisions)
+            if ma:
+                info[title]["img_id"] = ma.group(1)
+                info[title]["img_url"] = id_to_url.get(str(ma.group(1)), "")
+            else:
+                print(revisions)
     # ---
     # printe.output(json.dumps(pages, indent=2))
     # ---
     return info


-def one_img_info(title, study_id):
+def one_img_info(title, study_id, json_data):
     # ---
-    info = gt_img_info(title)
+    file = st_dic_infos / f"{study_id}_s_id.json"
+    # ---
+    if file.exists():
+        printe.output(f"<<green>> one_img_info: {file} exists")
+        with open(file, encoding="utf-8") as f:
+            return json.load(f)
+    # ---
+    id_to_url = {}
+    # ---
+    for x in json_data:
+        for n, image in enumerate(x["images"], start=1):
+            id_to_url[str(image["id"])] = image["public_filename"]
+    # ---
+    info = gt_img_info(title, id_to_url)
     # ---
     # printe.output(json.dumps(pages, indent=2))
     # ---
-    dump_st(info, study_id)
+    dump_st(info, file)
     # ---
     return info


 def test():
-    title = ["File:Appendicitis (CT angiogram) (Radiopaedia 154713-134732 This comic explains the pathophysiology of appendicitis. 4).jpg", "File:Appendicitis (CT angiogram) (Radiopaedia 154713-134732 This comic explains the pathophysiology of appendicitis. 2).jpg"]
-    info = one_img_info(title)
+    title = ["File:1st metatarsal head fracture (Radiopaedia 99187-120594 Frontal 1).png", "File:Appendicitis (CT angiogram) (Radiopaedia 154713-134732 This comic explains the pathophysiology of appendicitis. 02).jpg"]
+    info = gt_img_info(title)
     # ---
     print(json.dumps(info, indent=2))
     # ---
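
A hedged sketch of how the reworked gt_img_info might be driven (both the title and the id-to-url map are invented for illustration; per one_img_info above, the real map is built from the study JSON's image "id" and "public_filename" fields, and results for batches of 40 titles are merged into one dict):

    id_to_url = {"58331091": "https://example.org/images/58331091/full.png"}  # hypothetical
    titles = ["File:Some study (Radiopaedia 11111-22222 Frontal 1).png"]  # hypothetical

    info = gt_img_info(titles, id_to_url)
    # each entry looks like {"img_url": "...", "img_id": "..."}; img_id is filled
    # only when the URL had to be recovered from an "Image ID: NNN" line in the wikitext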
100 changes: 100 additions & 0 deletions fix_mass/fix_sets/bots/mv_files.py
@@ -0,0 +1,100 @@
"""
s
from fix_mass.fix_sets.bots.mv_files import to_move_work
"""
import re
import json
import sys
from pathlib import Path
from newapi import printe

Dir = Path(__file__).parent.parent

st_dit = Dir / "jsons/studies_files"

from newapi.ncc_page import NEW_API

api_new = NEW_API("www", family="nccommons")
api_new.Login_to_wiki()


def dump_it(data):
for s_id, files in data.items():
with open(st_dit / f"{s_id}.json", "w", encoding="utf-8") as f:
json.dump(files, f, ensure_ascii=False, indent=2)
printe.output(f"<<green>> write {len(files)} to {s_id}.json")


def change_names(file_dict):
modified_file_dict = {}
new_t = []

for key, value in file_dict.items():
new_key = f"0{key}"

# new_filename = value.replace(value[value.rfind(" ") + 1 : value.find(").jpg")], new_key)
ma = re.match(r"^(.*?) \d+(\)\.\w+)$", value)
if not ma:
modified_file_dict[value] = value
continue
# ---
new_filename = ma.group(1) + " " + new_key + ma.group(2)
# ---
if new_filename in new_t:
printe.output(f"duplicte: {new_filename}")
return False

modified_file_dict[value] = new_filename

new_t.append(new_filename)

return modified_file_dict


def mv_file(old, new):
if "mv_test" in sys.argv:
return True
move_it = api_new.move(old, new, reason="")
return move_it


def mv_files_change_text(text, tab):

n_text = text
# ---
for old, new in tab.items():
# ---
mv = mv_file(old, new)
# ---
if mv:
n_text = n_text.replace(old, new)
# ---
return n_text


def to_move_work(text, to_move):
# ---
new_text = text
# ---
if "mv" in sys.argv:
for ty, files in to_move.items():
# ---
# if any file start with http return text
if any(x.startswith("http") for x in files.values()):
printe.output(f"<<red>> {ty} {len(files)} x.startswith(http)")
return text
# ---
printe.output(f"<<blue>> {ty} {len(files)}")
# printe.output(files)
# ---
neww = change_names(files)
# ---
if neww:
# ---
new_text = mv_files_change_text(new_text, neww)
# printe.output(json.dumps(neww, indent=2))
# ---
text = new_text
# ---
return text
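
A hedged example of what change_names appears to do (file names invented; it assumes file_dict maps an index to the current name, and rewrites the trailing number in each name as "0" + key so single-digit frames zero-pad consistently):

    files = {
        "1": "File:Sample case (Radiopaedia 11111-22222 Axial 1).jpg",
        "2": "File:Sample case (Radiopaedia 11111-22222 Axial 2).jpg",
    }

    renames = change_names(files)
    # {"...Axial 1).jpg": "...Axial 01).jpg", "...Axial 2).jpg": "...Axial 02).jpg"}

Note that to_move_work only performs moves when "mv" is passed on the command line, and mv_file short-circuits to True under "mv_test", so the rename mapping can be dry-run without touching the wiki.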