Skip to content

Commit

Permalink
Merge branch 'image-metadata' fixes #104, #105
Browse files Browse the repository at this point in the history
  • Loading branch information
siznax committed Jan 19, 2018
2 parents cea7197 + 9527077 commit 726ae15
Show file tree
Hide file tree
Showing 3 changed files with 141 additions and 42 deletions.
131 changes: 105 additions & 26 deletions tests/imageinfo.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,114 @@
# -*- coding:utf-8 -*-

query = 'https://en.wikipedia.org/w/api.php?action=query&format=json&formatversion=2&iiprop=size|url|timestamp&prop=imageinfo&titles=File%3ADouglas%20adams%20portrait%20cropped.jpg'
query = 'https://en.wikipedia.org/w/api.php?action=query&formatversion=2&iiprop=size|url|timestamp|extmetadata&prop=imageinfo&titles=File%3ADouglas%20adams%20portrait%20cropped.jpg'

response = r'''
{
"continue": {
"iistart": "2010-04-16T22:53:21Z",
"continue": "||"
},
"query": {
"pages": [
{
"ns": 6,
"title": "File:Douglas adams portrait cropped.jpg",
"missing": true,
"known": true,
"imagerepository": "shared",
"imageinfo": [
{
"timestamp": "2010-04-16T22:54:28Z",
"size": 32915,
"width": 333,
"height": 386,
"url": "https://upload.wikimedia.org/wikipedia/commons/c/c0/Douglas_adams_portrait_cropped.jpg",
"descriptionurl": "https://commons.wikimedia.org/wiki/File:Douglas_adams_portrait_cropped.jpg",
"descriptionshorturl": "https://commons.wikimedia.org/w/index.php?curid=10031710"
}
"continue": {
"iistart": "2010-04-16T22:53:21Z",
"continue": "||"
},
"query": {
"pages": [
{
"ns": 6,
"title": "File:Douglas adams portrait cropped.jpg",
"missing": true,
"known": true,
"imagerepository": "shared",
"imageinfo": [
{
"timestamp": "2010-04-16T22:54:28Z",
"size": 32915,
"width": 333,
"height": 386,
"url": "https://upload.wikimedia.org/wikipedia/commons/c/c0/Douglas_adams_portrait_cropped.jpg",
"descriptionurl": "https://commons.wikimedia.org/wiki/File:Douglas_adams_portrait_cropped.jpg",
"descriptionshorturl": "https://commons.wikimedia.org/w/index.php?curid=10031710",
"extmetadata": {
"DateTime": {
"value": "2010-04-16 22:54:28",
"source": "mediawiki-metadata",
"hidden": ""
},
"ObjectName": {
"value": "Douglas adams portrait cropped",
"source": "mediawiki-metadata",
"hidden": ""
},
"CommonsMetadataExtension": {
"value": 1.2,
"source": "extension",
"hidden": ""
},
"Categories": {
"value": "Douglas Adams|Portrait photographs of men|Self-published work|Uploaded with derivativeFX",
"source": "commons-categories",
"hidden": ""
},
"Assessments": {
"value": "",
"source": "commons-categories",
"hidden": ""
},
"ImageDescription": {
"value": "douglas adams inspired \"Hitch hikers guide to the galaxy\" H2G2 <a rel=\"nofollow\" class=\"external text\" href=\"http://www.hughes-photography.eu\">www.hughes-photography.eu</a>",
"source": "commons-desc-page"
},
"DateTimeOriginal": {
"value": "",
"source": "commons-desc-page"
},
"Credit": {
"value": "<ul>\n<li><a href=\"//commons.wikimedia.org/wiki/File:Douglas_adams_portrait.jpg\" title=\"File:Douglas adams portrait.jpg\">Douglas_adams_portrait.jpg</a></li>\n</ul>",
"source": "commons-desc-page",
"hidden": ""
},
"Artist": {
"value": "<ul>\n<li>\n<a href=\"//commons.wikimedia.org/wiki/File:Douglas_adams_portrait.jpg\" title=\"File:Douglas adams portrait.jpg\">Douglas_adams_portrait.jpg</a>: <a rel=\"nofollow\" class=\"external text\" href=\"https://www.flickr.com/people/79664273@N00\">michael hughes</a> from berlin, germany</li>\n<li>derivative work: <a href=\"//commons.wikimedia.org/wiki/User:Beao\" title=\"User:Beao\">Bea</a><b><a href=\"//commons.wikimedia.org/wiki/User_talk:Beao\" title=\"User talk:Beao\">o</a></b>\n</li>\n</ul>",
"source": "commons-desc-page"
},
"LicenseShortName": {
"value": "CC BY-SA 2.0",
"source": "commons-desc-page",
"hidden": ""
},
"UsageTerms": {
"value": "Creative Commons Attribution-Share Alike 2.0",
"source": "commons-desc-page",
"hidden": ""
},
"AttributionRequired": {
"value": "true",
"source": "commons-desc-page",
"hidden": ""
},
"LicenseUrl": {
"value": "https://creativecommons.org/licenses/by-sa/2.0",
"source": "commons-desc-page",
"hidden": ""
},
"Copyrighted": {
"value": "True",
"source": "commons-desc-page",
"hidden": ""
},
"Restrictions": {
"value": "",
"source": "commons-desc-page",
"hidden": ""
},
"License": {
"value": "cc-by-sa-2.0",
"source": "commons-templates",
"hidden": ""
}
}
}
]
}
]
}
]
}
}
}
'''

Expand Down
50 changes: 35 additions & 15 deletions wptools/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,15 +76,18 @@ def __init__(self, *args, **kwargs):
else:
self.show()

def __update_imagedata(self, title, _from, info):
def __insert_image_info(self, title, _from, info):
"""
Update page images with get_imageinfo() data
Insert API image INFO into matching image dict
We make one imageinfo request containing only unique
image['file'] names. We match the API response data to an
image['file'] by API title/file match or API
normalized["from"]/file match. So, some imageinfo data will be
applied to more than one image['kind'].
We make one imageinfo request containing only unique image
filenames. We reduce duplication by asking for image data per
file, instead of per "kind" or source (Wikipedia, Wikidata,
etc.), because some sources reference the same image file. We
match API imageinfo response data to existing image filenames
by API title or normalized "from" title. So, some imageinfo
data will be applied to more than one image "kind" (source) if
they share the same filename.
"""
for img in self.data['image']:
if 'url' not in img:
Expand All @@ -93,6 +96,28 @@ def __update_imagedata(self, title, _from, info):
elif _from == img['file']: # matching from/file
img.update(info)

def __pull_image_info(self, title, imageinfo, normalized):
    """
    Fold each API imageinfo entry into the page's image data

    For every entry in IMAGEINFO: tag it with TITLE, look up the
    pre-normalization ("from") name for TITLE in NORMALIZED, move
    any "extmetadata" into a single "metadata" member, and pass
    the result on to be matched against the known images.
    """
    for entry in imageinfo:
        entry['title'] = title

        # find the API normalized "from" filename used for matching;
        # no early exit, so the last matching normalization wins
        source_name = None
        for mapping in normalized:
            if title == mapping['to']:
                source_name = mapping['from']

        # collect all "metadata" under one member; a missing or
        # empty "extmetadata" leaves "metadata" as an empty dict
        collected = {}
        extended = entry.get('extmetadata')
        if extended:
            collected.update(extended)
            del entry['extmetadata']
        entry['metadata'] = collected

        self.__insert_image_info(title, source_name, entry)

def _missing_imageinfo(self):
"""
returns list of image filenames that are missing info
Expand Down Expand Up @@ -189,14 +214,9 @@ def _set_imageinfo_data(self):

for page in pages:
title = page.get('title')
if page.get('imageinfo'):
for info in page['imageinfo']:
info.update({'title': title})
_from = None # normalized filename
for norm in normalized:
if title == norm['to']:
_from = norm['from']
self.__update_imagedata(title, _from, info)
imageinfo = page.get('imageinfo')
if imageinfo:
self.__pull_image_info(title, imageinfo, normalized)

# Mark missing imageinfo to prevent duplicate requests
for img in self.data['image']:
Expand Down
2 changes: 1 addition & 1 deletion wptools/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ class WPToolsQuery(object):
"${WIKI}/w/api.php?action=query"
"&format=json"
"&formatversion=2"
"&iiprop=size|url|timestamp"
"&iiprop=size|url|timestamp|extmetadata"
"&prop=imageinfo"
"&titles=${FILES}"))

Expand Down

0 comments on commit 726ae15

Please sign in to comment.