"""Import Wikimedia Commons images for taxa into ``taxonomie.t_medias``.

Module path in the repository:
data/scripts/import_wikimedia_commons/functions.py

Third-party libraries (SPARQLWrapper, requests, xmltodict) are imported
inside the helpers that use them, so this module can be imported, and
``getLicence`` reused, without those packages installed.
"""

WIKIDATA_SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
COMMONS_API_URL = (
    "https://tools.wmflabs.org/magnus-toolserver/commonsapi.php?image=%s"
)
# Seconds to wait for the Commons API before giving up on one image.
HTTP_TIMEOUT = 30
# Author stored when Commons gives none, or one too long to keep.
DEFAULT_AUTHOR = 'Commons'
MAX_AUTHOR_LENGTH = 500

# Items whose Wikidata property P3186 equals the given cd_ref and that
# have an image (P18). The cd_ref is substituted for '%s'.
IMAGES_QUERY = """SELECT ?item ?itemLabel ?nomSc ?image ?identifiant_TAXREF WHERE {
    ?item wdt:P225 ?nomSc.
    ?item wdt:P18 ?image.
    ?item wdt:P3186 '%s'
    SERVICE wikibase:label { bd:serviceParam wikibase:language "fr" }
} LIMIT 200"""

# Placeholders are bound by the database driver, never by Python string
# formatting: titles and authors routinely contain quotes.
INSERT_MEDIA_SQL = """INSERT INTO taxonomie.t_medias
    (cd_ref, titre, url, is_public, id_type, auteur, source, licence)
    VALUES (%s, %s, %s, true, 2, %s, 'Wikimedia Commons', %s)
"""

# For every cd_ref that has no media with id_type = 1, switch its media
# with the highest id_media to id_type = 1.
PROMOTE_MEDIA_SQL = """
    UPDATE taxonomie.t_medias SET id_type = 1
    WHERE id_media IN (
        SELECT max(id_media)
        FROM taxonomie.t_medias t
        LEFT OUTER JOIN (SELECT cd_ref FROM taxonomie.t_medias WHERE id_type = 1) e
        ON t.cd_ref = e.cd_ref
        WHERE e.cd_ref IS NULL
        GROUP BY t.cd_ref
    );
"""


def getLicence(licences):
    """Return the licence name(s) of a Commons file as one string.

    :param licences: the parsed ``license`` element: a single mapping
        with a ``name`` key, a list of such mappings, or an empty/``None``
        value when there is no licence.
    :returns: the single name, the names joined with ``'; '``, or ``''``
        when nothing was given.
    :raises KeyError: if an entry has no ``name`` key.
    """
    if not licences:
        return ''
    if isinstance(licences, dict):
        return licences['name']
    return '; '.join(entry['name'] for entry in licences)


def _make_sparql_client():
    """Build a SPARQL client for Wikidata that returns JSON."""
    from SPARQLWrapper import JSON, SPARQLWrapper

    client = SPARQLWrapper(WIKIDATA_SPARQL_ENDPOINT)
    client.setReturnFormat(JSON)
    return client


def _fetch_image_urls(sparql, cd_ref):
    """Return the non-empty image URLs Wikidata lists for ``cd_ref``."""
    sparql.setQuery(IMAGES_QUERY % cd_ref)
    results = sparql.query().convert()
    return [
        binding['image']['value']
        for binding in results["results"]["bindings"]
        if binding['image']['value']
    ]


def _fetch_commons_metadata(image_url):
    """Download and parse the Commons API description of ``image_url``."""
    import requests
    import xmltodict

    # The file name is whatever follows 'Special:FilePath/' in the URL.
    file_name = image_url.split('Special:FilePath/', 1)[1]
    response = requests.get(COMMONS_API_URL % file_name, timeout=HTTP_TIMEOUT)
    response.raise_for_status()
    return xmltodict.parse(response.content)


def _build_media_params(cd_ref, image_url, metadata):
    """Map parsed Commons metadata to the INSERT_MEDIA_SQL parameters."""
    file_info = metadata['response']['file']
    author = file_info.get('author')
    # An empty <author/> element is parsed as None; overly long values
    # are replaced as well.
    if not author or len(author) >= MAX_AUTHOR_LENGTH:
        author = DEFAULT_AUTHOR
    licence = getLicence(metadata['response']['licenses']['license'])
    return (cd_ref, file_info['name'], image_url, author, licence)


def _import_image(dbconnexion, cur, cd_ref, image_url, simulate):
    """Insert (or, when simulating, print) one image; never raises."""
    print(' -- INSERT IMAGE')
    try:
        metadata = _fetch_commons_metadata(image_url)
        params = _build_media_params(cd_ref, image_url, metadata)
        if simulate is False:
            cur.execute(INSERT_MEDIA_SQL, params)
            dbconnexion.commit()
        else:
            print("%s -- %r" % (INSERT_MEDIA_SQL, params))
    except Exception as e:
        # Best-effort import: report, reset the transaction, and let the
        # caller carry on with the next image.
        print(' ERREOR')
        print(e)
        dbconnexion.rollback()


def main(dbconnexion, cd_refs, refreshAtlas=True, simulate=True):
    """Import Wikimedia Commons images for the given taxa.

    For each taxon, Wikidata is queried for images, the Commons API is
    asked for each image's name, author and licence, and one row per
    image is inserted into ``taxonomie.t_medias``. A failure on one
    taxon or image is printed and does not stop the run.

    :param dbconnexion: open DB-API connection (``cursor``, ``commit``
        and ``rollback`` are used).
    :param cd_refs: iterable of rows whose first item is a cd_ref.
    :param refreshAtlas: when true, and rows were actually written,
        refresh ``atlas.vm_medias`` and ``atlas.vm_taxons_plus_observes``.
    :param simulate: anything other than the literal ``False`` means dry
        run: the statements are printed and nothing is written.
    """
    cur = dbconnexion.cursor()
    sparql = None

    for cd_ref in cd_refs:
        taxon_id = cd_ref[0]
        print("Taxon %s" % taxon_id)
        if sparql is None:
            # Built on first use, outside the try below, so a missing
            # SPARQLWrapper fails loudly instead of once per taxon.
            sparql = _make_sparql_client()
        try:
            image_urls = _fetch_image_urls(sparql, taxon_id)
        except Exception as e:
            print(' ERREOR')
            print(e)
            continue
        for image_url in image_urls:
            _import_image(dbconnexion, cur, taxon_id, image_url, simulate)

    if simulate is False:
        cur.execute(PROMOTE_MEDIA_SQL)
        # NOTE(review): the nesting of this refresh was not recoverable
        # from the flattened source. It is tied to real writes here,
        # since a dry run changes nothing the views could pick up.
        if refreshAtlas:
            cur.execute("REFRESH MATERIALIZED VIEW atlas.vm_medias;")
            cur.execute("REFRESH MATERIALIZED VIEW atlas.vm_taxons_plus_observes;")

    dbconnexion.commit()
# coding: utf8
"""Example script: import Wikimedia Commons media for a list of taxa.

Module path in the repository:
data/scripts/import_wikimedia_commons/run_import.py

Usage:
    - create a symbolic link to config.py so the database connection
      settings can be read;
    - choose the SQL query that returns the taxa (cd_ref) for which
      media should be fetched;
    - adjust the arguments passed to ``main``.
"""
import sys

import psycopg2

import configparser  # not used below; kept from the original script

from config import SQLALCHEMY_DATABASE_URI
from functions import main

# Taxa of taxonomie.bib_noms that have no row in taxonomie.t_medias.
SQL_TAXONS_WITHOUT_MEDIA = """SELECT DISTINCT cd_ref
    FROM taxonomie.bib_noms
    LEFT OUTER JOIN taxonomie.t_medias USING(cd_ref)
    WHERE id_media IS NULL
"""
# First 100 rows of the atlas view.
SQL_TAXONS_FROM_ATLAS_VIEW = """SELECT cd_ref from atlas.vm_taxons_plus_observes LIMIT 100"""

# Query actually run. Switch to SQL_TAXONS_WITHOUT_MEDIA to target only
# taxa that have no media yet.
sql = SQL_TAXONS_FROM_ATLAS_VIEW

try:
    conn = psycopg2.connect(SQLALCHEMY_DATABASE_URI)
except Exception as e:
    # Nothing can be done without a connection: report and stop here
    # instead of failing later on an undefined ``conn``.
    print("Connexion à la base impossible")
    print(e)
    sys.exit(1)

try:
    try:
        cur = conn.cursor()
        cur.execute(sql)
        rows = cur.fetchall()
    except Exception as e:
        print("Problème lors de la récupération de la liste des cd_ref")
        print(e)
        sys.exit(1)

    # refreshAtlas=False, simulate=False: rows are really inserted and
    # the atlas materialized views are left untouched.
    main(conn, rows, False, False)
finally:
    # Runs on success, on error and on sys.exit alike.
    conn.close()