
Update #129

Merged
merged 6 commits on Apr 14, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -38,3 +38,4 @@ mass/radiox/**
ncc_core/nc_import/long_bots.py
/md_core/newapix
/mass/eyerounds/htmls
/fix_mass/fix_sets/studies_done
22 changes: 22 additions & 0 deletions fix_mass/fix_sets/bots/done.py
@@ -0,0 +1,22 @@
"""

from fix_mass.fix_sets.bots.done import studies_done_append, find_done #find_done(study_id)

"""
from pathlib import Path

Dir = Path(__file__).parent.parent

studies_done_dir = Dir / "studies_done"

def find_done(study_id):
file = studies_done_dir / f"{study_id}.done"
if file.exists():
return True
return False


def studies_done_append(study_id):
file = studies_done_dir / f"{study_id}.done"
if not file.exists():
file.touch()
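A minimal usage sketch of the new done.py helpers, assuming the fix_mass/fix_sets/studies_done marker directory (now ignored via the .gitignore change above) already exists; the study id below is hypothetical.

from fix_mass.fix_sets.bots.done import studies_done_append, find_done

study_id = "120594"  # hypothetical Radiopaedia study id

if not find_done(study_id):        # no "<study_id>.done" marker file yet
    # ... process the study here ...
    studies_done_append(study_id)  # touch the marker so the study is skipped next time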
128 changes: 87 additions & 41 deletions fix_mass/fix_sets/bots/get_img_info.py
@@ -21,31 +21,22 @@

st_dic_infos = Dir / "jsons/studies_files_infos"

def dump_st(data, s_id):
file = st_dic_infos / f"{s_id}_s_id.json"

def dump_st(data, file):

with open(file, "w", encoding="utf-8") as f:
json.dump(data, f, ensure_ascii=False, indent=2)
printe.output(f"<<green>> write {len(data)} to file: {file}")

def gt_img_info(title):

def gt_img_info(titles, id_to_url={}):
# ---
title = [title] if not isinstance(title, list) else title
titles = [titles] if not isinstance(titles, list) else titles
# ---
info = {}
printe.output(f"one_img_info: {len(title)=}")
titles = [x for x in titles if x]
# ---
params = {
"action": "query",
"titles": "|".join(title),
# "prop": "revisions|categories|info|extlinks",
"prop": "extlinks",
# "clprop": "sortkey|hidden", # categories
# "rvprop": "timestamp|content|user|ids", # revisions
# "cllimit": "max", # categories
"ellimit": "max", # extlinks
"formatversion": "2",
}
data = api_new.post_params(params)
info = {}
printe.output(f"one_img_info: {len(titles)=}")
# ---
_x = {
"pages": [
@@ -65,47 +56,102 @@ def gt_img_info(title):
]
}
# ---
pages = data.get("query", {}).get("pages", [])
params = {
"action": "query",
# "titles": "|".join(titles),
# "prop": "revisions|categories|info|extlinks",
"prop": "revisions|extlinks",
# "clprop": "sortkey|hidden", # categories
"rvprop": "content", # revisions
# "cllimit": "max", # categories
"ellimit": "max", # extlinks
"formatversion": "2",
}
# ---
for page in pages:
extlinks = page.get("extlinks", [])
title = page.get("title")
# work with 40 titles at once
for i in range(0, len(titles), 40):
group = titles[i : i + 40]
params["titles"] = "|".join(group)
# ---
info[title] = {"img_url": "", "case_url": "", "study_url": "", "caseId": "", "studyId": ""}
# print("|".join(group))
# ---
for extlink in extlinks:
url = extlink.get("url")
ma = re.match("https://radiopaedia.org/cases/(\d+)/studies/(\d+)", url)
if url.find("/images/") != -1:
info[title]["img_url"] = url

elif re.match(r"^https://radiopaedia.org/cases/[^\d\/]+$", url):
info[title]["case_url"] = url

elif ma:
info[title]["study_url"] = url
info[title]["caseId"] = ma.group(1)
info[title]["studyId"] = ma.group(2)
data = api_new.post_params(params)
# ---
error = data.get("error", {})
if error:
printe.output(json.dumps(error, indent=2))
# ---
pages = data.get("query", {}).get("pages", [])
# ---
for page in pages:
extlinks = page.get("extlinks", [])
title = page.get("title")
# ---
# info[title] = {"img_url": "", "case_url": "", "study_url": "", "caseId": "", "studyId": "", "img_id": ""}
info[title] = {"img_url": "", "img_id": ""}
# ---
for extlink in extlinks:
url = extlink.get("url")
# ma = re.match("https://radiopaedia.org/cases/(\d+)/studies/(\d+)", url)
if url.find("/images/") != -1:
info[title]["img_url"] = url

# elif re.match(r"^https://radiopaedia.org/cases/[^\d\/]+$", url):
# info[title]["case_url"] = url

# elif ma:
# info[title]["study_url"] = url
# info[title]["caseId"] = ma.group(1)
# info[title]["studyId"] = ma.group(2)
# ---
revisions = page.get("revisions")
if info[title]["img_url"]:
continue
# ---
if not revisions:
continue
# ---
revisions = revisions[0]["content"]
# match a line like "* Image ID: 58331091" in the revision content
ma = re.search(r"Image ID: (\d+)", revisions)
if ma:
info[title]["img_id"] = ma.group(1)
info[title]["img_url"] = id_to_url.get(str(ma.group(1)), "")
else:
print(revisions)
# ---
# printe.output(json.dumps(pages, indent=2))
# ---
return info


def one_img_info(title, study_id):
def one_img_info(title, study_id, json_data):
# ---
info = gt_img_info(title)
file = st_dic_infos / f"{study_id}_s_id.json"
# ---
if file.exists():
printe.output(f"<<green>> one_img_info: {file} exists")
with open(file, encoding="utf-8") as f:
return json.load(f)
# ---
id_to_url = {}
# ---
for x in json_data:
for n, image in enumerate(x["images"], start=1):
id_to_url[str(image["id"])] = image["public_filename"]
# ---
info = gt_img_info(title, id_to_url)
# ---
# printe.output(json.dumps(pages, indent=2))
# ---
dump_st(info, study_id)
dump_st(info, file)
# ---
return info


def test():
title = ["File:Appendicitis (CT angiogram) (Radiopaedia 154713-134732 This comic explains the pathophysiology of appendicitis. 4).jpg", "File:Appendicitis (CT angiogram) (Radiopaedia 154713-134732 This comic explains the pathophysiology of appendicitis. 2).jpg"]
info = one_img_info(title)
title = ["File:1st metatarsal head fracture (Radiopaedia 99187-120594 Frontal 1).png", "File:Appendicitis (CT angiogram) (Radiopaedia 154713-134732 This comic explains the pathophysiology of appendicitis. 02).jpg"]
info = gt_img_info(title)
# ---
print(json.dumps(info, indent=2))
# ---
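A hypothetical call site for the revised one_img_info(); the titles, study id and json_data payload below are invented and only illustrate the shapes the function expects.

from fix_mass.fix_sets.bots.get_img_info import one_img_info

titles = [
    "File:1st metatarsal head fracture (Radiopaedia 99187-120594 Frontal 1).png",
]

# json_data mirrors what the function iterates over: a list of studies, each
# carrying an "images" list of {"id", "public_filename"} entries
json_data = [
    {"images": [{"id": 58331091, "public_filename": "https://example.org/images/58331091/full.png"}]},
]

info = one_img_info(titles, "120594", json_data)
# info maps each title to {"img_url": ..., "img_id": ...}; the result is also
# cached as jsons/studies_files_infos/120594_s_id.json and reused on later runs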
100 changes: 100 additions & 0 deletions fix_mass/fix_sets/bots/mv_files.py
@@ -0,0 +1,100 @@
"""
from fix_mass.fix_sets.bots.mv_files import to_move_work

"""
import re
import json
import sys
from pathlib import Path
from newapi import printe

Dir = Path(__file__).parent.parent

st_dit = Dir / "jsons/studies_files"

from newapi.ncc_page import NEW_API

api_new = NEW_API("www", family="nccommons")
api_new.Login_to_wiki()


def dump_it(data):
for s_id, files in data.items():
with open(st_dit / f"{s_id}.json", "w", encoding="utf-8") as f:
json.dump(files, f, ensure_ascii=False, indent=2)
printe.output(f"<<green>> write {len(files)} to {s_id}.json")


def change_names(file_dict):
modified_file_dict = {}
new_t = []

for key, value in file_dict.items():
new_key = f"0{key}"

# new_filename = value.replace(value[value.rfind(" ") + 1 : value.find(").jpg")], new_key)
ma = re.match(r"^(.*?) \d+(\)\.\w+)$", value)
if not ma:
modified_file_dict[value] = value
continue
# ---
new_filename = ma.group(1) + " " + new_key + ma.group(2)
# ---
if new_filename in new_t:
printe.output(f"duplicte: {new_filename}")
return False

modified_file_dict[value] = new_filename

new_t.append(new_filename)

return modified_file_dict


def mv_file(old, new):
if "mv_test" in sys.argv:
return True
move_it = api_new.move(old, new, reason="")
return move_it


def mv_files_change_text(text, tab):

n_text = text
# ---
for old, new in tab.items():
# ---
mv = mv_file(old, new)
# ---
if mv:
n_text = n_text.replace(old, new)
# ---
return n_text


def to_move_work(text, to_move):
# ---
new_text = text
# ---
if "mv" in sys.argv:
for ty, files in to_move.items():
# ---
# if any file name starts with http, return the text unchanged
if any(x.startswith("http") for x in files.values()):
printe.output(f"<<red>> {ty} {len(files)} x.startswith(http)")
return text
# ---
printe.output(f"<<blue>> {ty} {len(files)}")
# printe.output(files)
# ---
neww = change_names(files)
# ---
if neww:
# ---
new_text = mv_files_change_text(new_text, neww)
# printe.output(json.dumps(neww, indent=2))
# ---
text = new_text
# ---
return text
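A sketch of how to_move_work() might be driven, assuming the script runs with "mv" (and optionally "mv_test") on the command line; the wikitext and file names are invented, and note that importing the module logs in to nccommons at import time.

from fix_mass.fix_sets.bots.mv_files import to_move_work

# change_names() zero-pads the trailing index ("... Frontal 1).png" becomes
# "... Frontal 01).png"), mv_file() performs the on-wiki move (skipped when
# "mv_test" is in sys.argv), and the returned text has old names replaced
text = "[[File:Example study (Radiopaedia 99187-120594 Frontal 1).png|thumb]]"

to_move = {
    "frontal": {
        "1": "File:Example study (Radiopaedia 99187-120594 Frontal 1).png",
    },
}

new_text = to_move_work(text, to_move)  # moves only happen when "mv" is in sys.argv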