diff --git a/README.md b/README.md index a2902ca..6dc21d0 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,140 @@ -# metadump -Metadata dump utility +# Metadump + +Metadump is a command-line utility that extracts and displays the embedded metadata of various types of files. + +## Features + +* Extracting and displaying metadata of various types of files such as: + - EXIF + - PDF + - XMP (currently works only on a Linux machine) + - Microsoft Office documents: Word, Excel, PowerPoint (documents in the pre-2007 formats are not supported) +* Filtering of the extracted metadata +* Easy extensibility for other types of files + +## Prerequisites / Installing + +These instructions will get a copy of Metadump up and running on your machine. + +1. Python3 with the package manager pip must be installed - [Download Python](https://www.python.org/downloads/) + +2. Clone the project or download it as .zip and extract it + +3. Install the dependencies of the project by typing `pip3 install -r requirements.txt`. + +Now Metadump is ready for use. + +## Usage + +Execute the following command to display the help text: + + python3 metadump.py -h + +Metadump can be run with the following parameters: + + + -i or --input Path to the file or directory which should be analysed + + -f or --filter Select categories to be displayed + + --filteroptions Displays the options for the filtering parameter + + --version Displays the version of Metadump + + -v / -vv / -vvv Defines the level of verbosity. 
+ + -l or --limit Defines the maximum length of the output values (default: 30; longer values will be cut off) + + -o or --order Order key-value pairs by their category + + -r or --recursive If the input is a path to a directory, use this flag to let Metadump recursively select all files in this directory and its subdirectories + + -c or --printcategories Display categories of each extracted metadata key-value pair + + -s or --stream Stream the results (by default, results are displayed only after all files have been analysed). + + --showplugins Displays all loaded plugins + + -p or --plugins Specify the plugins to be used; files of other types will be ignored (useful if you want to analyse only a specific type of file in a directory). + + --showemptyfiles Include names of files for which no metadata could be extracted (the default is false) + + +## Examples + +### Example 1: Extracting metadata from a picture with default settings + +``` +python3 metadump.py -i ~/Downloads/picture.jpg +``` +|KEY | VALUE | DESCRIPTION | +|-----------------------| -------------------------------- | ----------------------------------------- | +|Make | Apple | embedded EXIF metadata +|Model | iPod touch | embedded EXIF metadata +|Software | Microsoft Windows Photo Viewer | embedded EXIF metadata +|DateTime | 2012:08:01 14:11:10 | embedded EXIF metadata +|DateTimeOriginal | 2012:05:30 21:23:13 | embedded EXIF metadata +|DateTimeDigitized | 2012:05:30 21:23:13 | embedded EXIF metadata +|GPSInfo => Latitude | 74.87166666666667 N | extracted from EXIF GPSInfo metadata +|GPSInfo => Longitude | 12.3913888888889 W | extracted from EXIF GPSInfo metadata +|GPSInfo => Altitude | 120.16254416961131 meter | extracted from EXIF GPSInfo metadata + +### Example 2: Extracting metadata from a picture with different parameters + +``` +python3 metadump.py -i ~/Downloads/picture.jpg -vvv -o -p EXIF -f time location +``` + +#### CATEGORY: TIME +|KEY | VALUE | DESCRIPTION | +|--- | --- | ---- | 
+|GPSInfo:GPSTimeStamp | 2012:05:30 21:23:13 | extracted from EXIF GPSInfo metadata +|DateTimeOriginal | 2012:05:30 21:23:13 | embedded EXIF metadata +|DateTimeDigitized | 2012:05:30 21:23:13 | embedded EXIF metadata +|DateTime | 2012:08:01 14:11:10 | embedded EXIF metadata + +#### CATEGORY: LOCATION +|KEY | VALUE | DESCRIPTION | +|--- | --- | ---- | +|GPSInfo | {1: 'N', 2: ((77, 1), (5218, 1 | embedded EXIF metadata +|GPSInfo:GPSLatitudeRef | N | extracted from EXIF GPSInfo metadata +|GPSInfo:GPSLatitude | ((77, 1), (5218, 100), (0, 1)) | extracted from EXIF GPSInfo metadata +|GPSInfo:GPSLongitudeRef | W | extracted from EXIF GPSInfo metadata +|GPSInfo:GPSLongitude | ((12, 1), (2329, 100), (0, 1)) | extracted from EXIF GPSInfo metadata +|GPSInfo:GPSAltitudeRef | Above sea level | extracted from EXIF GPSInfo metadata +|GPSInfo:GPSAltitude | (34006, 283) | extracted from EXIF GPSInfo metadata +|GPSInfo => Altitude | 120.16254416961131 meter | extracted from EXIF GPSInfo metadata +|GPSInfo => Latitude | 74.87166666666667 N | extracted from EXIF GPSInfo metadata +|GPSInfo => Longitude | 12.3913888888889 W | extracted from EXIF GPSInfo metadata + + +### Example 3: Extracting metadata from a word document +``` +python3 metadump.py -i ~/Downloads/document.docx -vvv +``` +|KEY | VALUE | DESCRIPTION | +|-----------------------| ----------- | ---- | +|title | Bill 2018 | Microsoft Office - docProps/core.xml +|creator | Phantasy Company - Accounting Dept. 
| Microsoft Office - docProps/core.xml +|keywords | Phantasy Company | Microsoft Office - docProps/core.xml +|description | Phantasy Company - Bill 2018 | Microsoft Office - docProps/core.xml +|lastModifiedBy | John Doe - Phantasy Company | Microsoft Office - docProps/core.xml +|revision | 7 | Microsoft Office - docProps/core.xml +|lastPrinted | 2016-02-24T13:43:00Z | Microsoft Office - docProps/core.xml +|created | 2016-02-26T10:36:00Z | Microsoft Office - docProps/core.xml +|modified | 2018-09-20T14:04:00Z | Microsoft Office - docProps/core.xml +|Template | Normal | Microsoft Office - docProps/app.xml +|TotalTime | 0 | Microsoft Office - docProps/app.xml +|Pages | 1 | Microsoft Office - docProps/app.xml +|Words | 59 | Microsoft Office - docProps/app.xml +|Characters | 378 | Microsoft Office - docProps/app.xml +|Application | Microsoft Office Word | Microsoft Office - docProps/app.xml +|DocSecurity | 0 | Microsoft Office - docProps/app.xml +|Lines | 3 | Microsoft Office - docProps/app.xml +|Paragraphs | 1 | Microsoft Office - docProps/app.xml +|ScaleCrop | false | Microsoft Office - docProps/app.xml +|LinksUpToDate | false | Microsoft Office - docProps/app.xml +|CharactersWithSpaces | 436 | Microsoft Office - docProps/app.xml +|SharedDoc | false | Microsoft Office - docProps/app.xml +|HyperlinksChanged | false | Microsoft Office - docProps/app.xml +|AppVersion | 15.0000 | Microsoft Office - docProps/app.xml \ No newline at end of file diff --git a/metadump.py b/metadump.py new file mode 100644 index 0000000..02d4952 --- /dev/null +++ b/metadump.py @@ -0,0 +1,423 @@ +#!/usr/bin/env python + +__description__ = 'Metadata dump utility' +__author__ = 'Florian Wahl' +__version__ = '1.0.0' +__date__ = '2019/03/17' + + +import os +import importlib.util +from datetime import datetime +import argparse +import sys + +BANNER_TEXT = """ __ ___ __ __ + / |/ /__ / /_____ _____/ /_ ______ ___ ____ + / /|_/ / _ \/ __/ __ `/ __ / / / / __ `__ \/ __ \\ + / / / / __/ /_/ /_/ / /_/ / 
/_/ / / / / / / /_/ / +/_/ /_/\___/\__/\__,_/\__,_/\__,_/_/ /_/ /_/ .___/ +by Florian Wahl /_/ + +""" + +VERSION_INFO = """version: {0} from {1} +developed by: {2} +""".format(__version__, __date__, __author__) + +FILTER_OPTIONS = """You can pass names of categories after the --filter option. Then only metadata from these categories will be printed out. +The categories are structured in a tree: + +|--time +| |-- creation_time +| |-- modify_time +| +|-- author +| |-- author_name +| |-- comment +| +|-- tool +| |-- hardware +| |-- software +| +|--location + |-- position_latitude + |-- position_longitude + + +Examples: +'python3 metadump.py -i INPUT -f time' Shows only timestamps +'python3 metadump.py -i INPUT -f location author_name' Shows only information about the location and the name of the author + +""" + +MAIN_CATEGORIES = ['time', 'author', 'tool', 'location'] + +UNICODE_SUPPORT = sys.stdout.encoding.lower().startswith('utf') + +###################################################################################### +# helper classes and functions +class VAction(argparse.Action): + """ + For parsing the verbosity level + """ + def __init__(self, option_strings, dest, nargs=None, const=None, default=None, type=None, choices=None, required=False, help=None, metavar=None): + super(VAction, self).__init__(option_strings, dest, nargs, const, default, type, choices, required, help, metavar) + self.values = 0 + + def __call__(self, parser, args, values, option_string=None): + if values is None: + self.values += 1 + else: + try: + self.values = int(values) + except ValueError: + self.values = values.count('v')+1 + setattr(args, self.dest, self.values) + + +def _filter_for_category(metadata, categories: list): + filtered_metadata = list() + for key, value, description, category, vlevel in metadata: + filter_passed = False + for cat in category: + if cat in categories: + filter_passed = True + break + if filter_passed: + filtered_metadata.append((key, value, description, 
category, vlevel)) + return filtered_metadata + + +def _parse_date_string(date_string): + datetime_formats = ['%Y:%m:%d %H:%M:%S', '%Y-%m-%dT%H:%M:%S', '%Y-%m-%dT%H:%M:%SZ'] + for datetime_format in datetime_formats: + try: + return datetime.strptime(date_string, datetime_format) + except ValueError: + pass + return None + + +def _convert_gps_specification(gps_coordinate: str): + reference_direction = _extract_gps_reference_direction(gps_coordinate=gps_coordinate) + if reference_direction == None: + print('reference_direction') + return None + gps_coordinate = gps_coordinate.replace(reference_direction, '').strip() + if ',' in gps_coordinate and '.' in gps_coordinate: # in degree format + try: + degree = int(gps_coordinate.split(',')[0]) + minutes = int(gps_coordinate.split(',')[1].split('.')[0]) + seconds = int(gps_coordinate.split(',')[1].split('.')[1]) + gps_coordinate = degree + (minutes / 60.0) + (seconds / 3600.0) + except Exception as e: + print(e) + return None + else: # in decimal format + try: + gps_coordinate = float(gps_coordinate) + except: + None + return gps_coordinate, reference_direction + + +def _extract_gps_reference_direction(gps_coordinate: str): + directions = ['N', 'E', 'W', 'S'] + for direction in directions: + if direction in gps_coordinate: + return direction + + +###################################################################################### +# load plugins dynamically +def _load_plugins(): + path_to_plugins = os.path.dirname(os.path.realpath(__file__)) + os.sep + 'plugins' + plugin_files = list() + for file_path in os.listdir(path_to_plugins): + if os.path.isfile(path_to_plugins + os.sep + file_path) and file_path.endswith('.py'): + plugin_files.append(path_to_plugins + os.sep + file_path) + + plugins = list() + for counter, plugin_file in enumerate(plugin_files): + specification = importlib.util.spec_from_file_location(name='plugin_{}'.format(counter), location=plugin_file) + module = 
importlib.util.module_from_spec(specification) + specification.loader.exec_module(module) + plugins.append(module.ANALYSER()) + return plugins + +PLUGINS = _load_plugins() + + +###################################################################################### +# functions which can be imported by other scripts + +def extract_metadata_of_file(path_to_file, specified_plugins=None): + """ + extracts all metadata of a file using the specified plugins + :param path_to_file: path to the file which should be parsed + """ + metadata = list() + for plugin in PLUGINS: + if specified_plugins != None and plugin.name() not in specified_plugins: + continue + metadata += plugin.extract_metadata(path_to_file) + return metadata + + +def get_creation_date(metadata: list): + filtered_metadata = _filter_for_category(metadata=metadata, categories=['creation_time']) + metadata_values = [x[1] for x in filtered_metadata] + parsed_dates = [_parse_date_string(x) for x in metadata_values] + parsed_dates = [x for x in parsed_dates if x is not None] + if len(parsed_dates) == 0: + return None + return parsed_dates.pop() + + +def get_modify_date(metadata: list): + filtered_metadata = _filter_for_category(metadata=metadata, categories=['modify_time']) + metadata_values = [x[1] for x in filtered_metadata] + parsed_dates = [_parse_date_string(x) for x in metadata_values] + parsed_dates = [x for x in parsed_dates if x is not None] + if len(parsed_dates) == 0: + return None + return parsed_dates.pop() + + +def get_GPS_coordinates(metadata: list): + latitude_filtered = _filter_for_category(metadata=metadata, categories=['position_latitude']) + longitude_filtered = _filter_for_category(metadata=metadata, categories=['position_longitude']) + latitude_values = [x[1] for x in latitude_filtered if x[1] != ''] + longitude_values = [x[1] for x in longitude_filtered if x[1] != ''] + converted_lat = [_convert_gps_specification(x) for x in latitude_values] + converted_lon = [_convert_gps_specification(x) 
for x in longitude_values] + converted_lat = [x for x in converted_lat if x is not None] + converted_lon = [x for x in converted_lon if x is not None] + if len(converted_lat) == 0 or len(converted_lon) == 0: + return None + lat, lat_ref = converted_lat.pop() + lon, lon_ref = converted_lon.pop() + return lat, lat_ref, lon, lon_ref + + +def get_author_name(metadata: list): + filtered_metadata = _filter_for_category(metadata=metadata, categories=['author_name']) + metadata_values = [x[1] for x in filtered_metadata if x[1] != ''] + if len(metadata_values) == 0: + return None + return '; '.join(metadata_values) + + +###################################################################################### +# main program +def _progress_bar(iteration, total, prefix='', suffix='', decimals=1, length=70): + if UNICODE_SUPPORT: + fill = '█' + else: + fill = '#' + percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total))) + filledLength = int(length * iteration // total) + bar = fill * filledLength + '-' * (length - filledLength) + print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end = '\r') + # Print New Line when its complete + if iteration == total: + print() + + +def _extract_metadata_of_list_of_files(file_paths: list, path_to_input, arguments): + extracted = list() + total_number_of_files = len(file_paths) + _progress_bar(iteration=0, total=total_number_of_files, prefix='Analysing Files:', suffix='', decimals=2) + for index, path_to_file in enumerate(file_paths): + metadata = extract_metadata_of_file(path_to_file=path_to_file, specified_plugins=arguments.plugins) + extracted.append((path_to_file, metadata)) + _progress_bar(iteration=index+1, total=total_number_of_files, prefix='Analysing Files:', suffix='', decimals=2) + print() + return extracted + + +def _preprocess_extracted_metadata(arguments, metadata): + # filter verbosity level + filtered_metadata = list() + for key, value, description, category, vlevel in metadata: + value = 
str(value) + if vlevel > arguments.verbose: + continue + if arguments.verbose < 3: + if value == '': + continue + filtered_metadata.append((str(key), value, description, category, vlevel)) + metadata = filtered_metadata + + # applies filter + if arguments.filter != None: + metadata = _filter_for_category(metadata=metadata, categories=arguments.filter) + + # applies value length limit + if arguments.limit is not None: + metadata = [(key, value[:arguments.limit], description, category, vlevel) for (key, value, description, category, vlevel) in metadata] + + return metadata + + +def _display_result(arguments, metadata_of_files, path_to_input, display_part_of_stream=False): + # filter empty files + if not arguments.showemptyfiles: + metadata_of_files = [(file_path, metadata) for file_path, metadata in metadata_of_files if len(metadata) > 0] + + # shorten the file paths + metadata_of_files = [(file_path.replace(path_to_input, ''), metadata) for file_path, metadata in metadata_of_files] + + if len(metadata_of_files) == 0 and not display_part_of_stream: + print('No metadata found') + return None + + # printing parameters + key_max = 0 + value_max = 0 + description_max = 0 + for _, data in metadata_of_files: + for key, value, description, _, _ in data: + key_max = max(key_max, len(key)) + value_max = max(value_max, len(value)) + description_max = max(description_max, len(description)) + + for file_name, metadata in metadata_of_files: + print(100*'=') + print('File: {}'.format(file_name)) + if len(metadata) == 0: + print('\tNo metadata found\n') + continue + + if arguments.order: + metadata = sorted(metadata, key=lambda x: x[3]) + categories = MAIN_CATEGORIES + ['other'] + sorted_metadata = {x: list() for x in categories} + for key, value, description, category, vlevel in metadata: + found = False + for cat in category: + if cat in sorted_metadata: + sorted_metadata[cat].append((key, value, description, category, vlevel)) + found = True + if not found: + 
sorted_metadata['other'].append((key, value, description, category, vlevel)) + for cat in categories: + data = sorted_metadata[cat] + if len(data) == 0: + continue + print('\tCATEGORY: {0}'.format(cat.upper())) + + _print_meta_data(key_max=key_max, value_max=value_max, description_max=description_max, indentation=2, metadata=data, arguments=arguments) + print() + else: + _print_meta_data(key_max=key_max, value_max=value_max, description_max=description_max, indentation=1, metadata=metadata, arguments=arguments) + print() + + +def _print_meta_data(key_max, value_max, description_max, indentation, metadata, arguments): + spacing = 3 + print() + if arguments.printcategories: + print(indentation*'\t', 'KEY'.ljust(key_max + spacing) + '| ' + 'VALUE'.ljust(value_max + spacing) + '| ' + 'DESCRIPTION'.ljust(description_max + spacing) + '| CATEGORIES') + print(indentation*'\t', (key_max + value_max + description_max+29)*'-') + else: + print(indentation*'\t', 'KEY'.ljust(key_max + spacing) + '| ' + 'VALUE'.ljust(value_max + spacing) + '| ' + 'DESCRIPTION') + print(indentation*'\t', (key_max + value_max + description_max+9)*'-') + for key, value, description, category, _ in metadata: + key = str(key) + value = str(value) + description = str(description) + try: + if arguments.printcategories: + cat = ', '.join(category) + print(indentation*'\t', key.ljust(key_max + spacing) + '| ' + value.ljust(value_max + spacing) + '| ' + description.ljust(description_max + spacing) + '| ' + cat) + else: + print(indentation*'\t', key.ljust(key_max + spacing) + '| ' + value.ljust(value_max + spacing) + '| ' + description) + except: + print(indentation*'\t', ' ---- Encoding ERROR ----- ') + + +if __name__ == '__main__': + print(BANNER_TEXT) + parser = argparse.ArgumentParser() + parser.add_argument('-i', '--input', help="Path to the file or directory which should be analysed", type=str) + parser.add_argument('-f', '--filter', nargs='+', default=None, help="Filter metadata for special 
kategory") + parser.add_argument('--filteroptions', action='store_true', help="Shows options for filtering extracted metadata") + parser.add_argument('--version', action='store_true', help="Shows version of metadump") + parser.add_argument('-v', nargs='?', action=VAction, default=1, dest='verbose', help="Defines verbosity level [-v, -vv, -vvv, -vvv]") + parser.add_argument('-l', '--limit', type=int, default=30, help="Maximal characters of metadata value") + parser.add_argument('-o', '--order', action='store_true', default=False, help="Order key-value pairs by their category") + parser.add_argument('-r', '--recursive', action='store_true', default=False, help="If the input is a path to a directory this flag will metadump seleact all files in this directory and subdirectories recursive") + parser.add_argument('-c', '--printcategories', action='store_true', default=False, help="Will print the categories of the extracted metadata") + parser.add_argument('-s', '--stream', action='store_true', default=False, help="When a file is analysed the results will be printed directly") + parser.add_argument('--showplugins', action='store_true', default=False, help="Prints all loaded plugins") + parser.add_argument('-p', '--plugins', nargs='+', default=None, help="Only use the specified Plugins") + parser.add_argument('--showemptyfiles', action='store_true', default=False, help="Prints a file although no metadata could be extracted") + + arguments = parser.parse_args() + + if arguments.filteroptions: + print(FILTER_OPTIONS) + exit() + + if arguments.version: + print(VERSION_INFO) + exit() + + if arguments.showplugins: + print('Loaded Plugins:') + for plugin in PLUGINS: + print('\t', plugin.name()) + exit() + + # check that input is valid + if arguments.input == None: + print('ERROR: Path to input must be set\n') + parser.print_help() + exit() + + path_to_input = os.path.abspath(arguments.input) + + if not os.path.exists(path_to_input): + print('ERROR: Path "{}" does not 
exist\n'.format(path_to_input)) + exit() + + # gather absolute paths to the files which should be scanned + input_list = list() + if os.path.isdir(path_to_input): + directory_queue = [path_to_input] + while len(directory_queue) > 0: + path_to_dir = directory_queue.pop(0) + for file_path in os.listdir(path_to_dir): + file_path = os.path.join(path_to_dir, file_path) + if os.path.isdir(file_path): + if arguments.recursive: + directory_queue.append(file_path) + else: + input_list.append(file_path) + else: + input_list.append(path_to_input) + + if len(input_list) == 0: + print('no files found') + exit() + + try: + if arguments.stream: + for path_to_file in input_list: + extract_metadata = extract_metadata_of_file(path_to_file=path_to_file, specified_plugins=arguments.plugins) + metadata_of_files = [(path_to_file, extract_metadata)] + # preprocess the extracted metadata + metadata_of_files = [ (path_to_file, _preprocess_extracted_metadata(arguments=arguments, metadata=metadata)) for path_to_file, metadata in metadata_of_files] + # display metadata + _display_result(arguments, metadata_of_files=metadata_of_files, path_to_input=path_to_input, display_part_of_stream=True) + else: + metadata_of_files = _extract_metadata_of_list_of_files(file_paths=input_list, path_to_input=path_to_input, arguments=arguments) + # preprocess the extracted metadata + metadata_of_files = [ (path_to_file, _preprocess_extracted_metadata(arguments=arguments, metadata=metadata)) for path_to_file, metadata in metadata_of_files] + # display metadata + _display_result(arguments, metadata_of_files=metadata_of_files, path_to_input=path_to_input) + except KeyboardInterrupt: + print() + print('Keyboard Interrupt: Stopping search') diff --git a/plugins/EXIF.py b/plugins/EXIF.py new file mode 100644 index 0000000..eb94a10 --- /dev/null +++ b/plugins/EXIF.py @@ -0,0 +1,153 @@ +import sys +from PIL import Image, ExifTags +from PIL.ExifTags import TAGS + + +KEY_TO_CATEGORIES = dict() 
+KEY_TO_CATEGORIES['DateTimeOriginal'] = (['time', 'creation_time'], 1) +KEY_TO_CATEGORIES['DateTimeDigitized'] = (['time', 'creation_time'], 1) +KEY_TO_CATEGORIES['CreateDate'] = (['time', 'creation_time'], 1) +KEY_TO_CATEGORIES['DateTime'] = (['time', 'modify_time'], 1) + +KEY_TO_CATEGORIES['CameraOwnerName'] = (['author', 'author_name'], 1) +KEY_TO_CATEGORIES['OwnerName'] = (['author', 'author_name'], 1) +KEY_TO_CATEGORIES['XPAuthor'] = (['author'], 1) +KEY_TO_CATEGORIES['Copyright'] = (['author'], 1) +KEY_TO_CATEGORIES['Artist'] = (['author'], 1) +KEY_TO_CATEGORIES['UserComment'] = (['author', 'comment'], 1) +KEY_TO_CATEGORIES['ImageDescription'] = (['author', 'comment'], 1) + +KEY_TO_CATEGORIES['Make'] = (['tool', 'hardware'], 1) +KEY_TO_CATEGORIES['Model'] = (['tool', 'hardware'], 1) +KEY_TO_CATEGORIES['SerialNumber'] = (['tool', 'hardware'], 1) +KEY_TO_CATEGORIES['BodySerialNumber'] = (['tool'], 1) +KEY_TO_CATEGORIES['CameraSerialNumber'] = (['tool', 'hardware'], 1) +KEY_TO_CATEGORIES['Software'] = (['tool', 'software'], 1) + +KEY_TO_CATEGORIES['GPSInfo'] = (['location'], 2) +KEY_TO_CATEGORIES['GPSInfo:GPSLatitudeRef'] = (['location'], 2) +KEY_TO_CATEGORIES['GPSInfo:GPSLatitude'] = (['location'], 2) +KEY_TO_CATEGORIES['GPSInfo:GPSLongitudeRef'] = (['location'], 2) +KEY_TO_CATEGORIES['GPSInfo:GPSLongitude'] = (['location'], 2) +KEY_TO_CATEGORIES['GPSInfo:GPSAltitudeRef'] = (['location'], 2) +KEY_TO_CATEGORIES['GPSInfo:GPSAltitude'] = (['location'], 2) +KEY_TO_CATEGORIES['GPSInfo:GPSTimeStamp'] = (['time'], 2) +KEY_TO_CATEGORIES['GPSInfo:Latitude'] = (['location'], 2) +KEY_TO_CATEGORIES['GPSInfo:Longitude'] = (['location'], 2) + +KEY_TO_CATEGORIES['GPSInfo => Latitude'] = (['location', 'position_latitude'], 1) +KEY_TO_CATEGORIES['GPSInfo => Longitude'] = (['location', 'position_longitude'], 1) +KEY_TO_CATEGORIES['GPSInfo => Altitude'] = (['location'], 1) + + + +class Exif_Analyser: + def name(self): + return 'EXIF' + + def extract_metadata(self, path_to_file: 
str) -> dict: + metadata = self._extract_metadata(path_to_file=path_to_file) + metadata = self._enrich_with_GPS_Information(metadata=metadata) + metadata = [(key, metadata[key][0], metadata[key][1]) for key in metadata] + metadata = self._enrich_with_categories(metadata=metadata) + return metadata + + def _enrich_with_categories(self, metadata: dict) -> dict: + enriched_metadata = list() + for key, value, describtion in metadata: + category, vlevel = KEY_TO_CATEGORIES.get(key, (list(), 3)) + enriched_metadata.append((key, value, describtion, category, vlevel)) + return enriched_metadata + + def _extract_metadata(self, path_to_file): + try: + image_file = Image.open(path_to_file, mode='r') + try: + metadata = image_file._getexif() + metadata = self._decode_metadata(metadata=metadata) + return metadata + except AttributeError: + return list() + except OSError: + return list() + + def _decode_metadata(self, metadata: dict) -> dict: + decoded = dict() + for (tag, value) in metadata.items(): + decoded_key = TAGS.get(tag, tag) + if type(value) is bytes: + try: + decoded_value = value.decode('utf-8', 'ignore') + except: + decoded_value = '[DECODING ERROR]' + else: + decoded_value = value + decoded[decoded_key] = (decoded_value, 'embedded EXIF metadata') + return decoded + + ############################################################################################## + # GPS + + def _enrich_with_GPS_Information(self, metadata: dict): + if 'GPSInfo' in metadata: + gps_info = self._decode_GPSInfo(gps_info_encoded=metadata['GPSInfo'][0]) + for key, value in gps_info.items(): + metadata['GPSInfo:{0}'.format(key)] = (value, 'extracted from EXIF GPSInfo metadata') + if self._gps_info_contains_GPS_position(decoded_gps_info=gps_info): + lat, lat_ref, lon, lon_ref = self._decode_GPS_position(decoded_gps_info=gps_info) + metadata['GPSInfo => Latitude'] = ('{0} {1}'.format(lat, lat_ref), 'extracted from EXIF GPSInfo metadata') + metadata['GPSInfo => Longitude'] = ('{0} 
{1}'.format(lon, lon_ref), 'extracted from EXIF GPSInfo metadata') + if self._gps_info_contains_altitude(decoded_gps_info=gps_info): + altitude, alt_ref = self._decode_GPS_altitude(decoded_gps_info=gps_info) + metadata['GPSInfo => Altitude'] = ('{0} meter'.format(altitude), 'extracted from EXIF GPSInfo metadata') + metadata['GPSInfo:GPSAltitudeRef'] = ('{0}'.format(alt_ref), 'extracted from EXIF GPSInfo metadata') + return metadata + + def _decode_GPSInfo(self, gps_info_encoded: dict): + gps_info = dict() + for key in gps_info_encoded.keys(): + decoded_key = ExifTags.GPSTAGS.get(key, key) + gps_info[decoded_key] = gps_info_encoded[key] + return gps_info + + def _gps_info_contains_GPS_position(self, decoded_gps_info: dict): + return 'GPSLongitude' in decoded_gps_info and 'GPSLatitude' in decoded_gps_info and 'GPSLatitudeRef' in decoded_gps_info and 'GPSLongitudeRef' in decoded_gps_info + + def _decode_GPS_position(self, decoded_gps_info: dict): + try: + lat = self._convert_degree_to_decimal(decoded_gps_info['GPSLatitude']) + lat_ref = decoded_gps_info['GPSLatitudeRef'] + lon = self._convert_degree_to_decimal(decoded_gps_info['GPSLongitude']) + lon_ref = decoded_gps_info['GPSLongitudeRef'] + return lat, lat_ref, lon, lon_ref + except: + return None + + def _gps_info_contains_altitude(self, decoded_gps_info: dict): + return 'GPSAltitude' in decoded_gps_info and 'GPSAltitudeRef' in decoded_gps_info + + def _decode_GPS_altitude(self, decoded_gps_info: dict): + altitude = float(decoded_gps_info['GPSAltitude'][0]) / float(decoded_gps_info['GPSAltitude'][1]) + if decoded_gps_info['GPSAltitudeRef'] == b'\x00': + alt_ref = 'Above sea level' + modifier = 1 + elif decoded_gps_info['GPSAltitudeRef'] == b'\x01': + alt_ref = 'Below sea level' + modifier = -1 + return altitude * modifier, alt_ref + + def _convert_degree_to_decimal(self, value): + degree = float(value[0][0]) / float(value[0][1]) + minutes_reference = float(value[1][1]) + seconds_reference = float(value[2][1]) + if 
minutes_reference == 1: + minutes = float(value[1][0]) / minutes_reference + seconds = float(value[2][0]) / seconds_reference + else: + seconds = (float(value[1][0]) / minutes_reference) % 1 + minutes = (float(value[1][0]) / minutes_reference) - seconds + seconds *= 100 + return degree + (minutes / 60.0) + (seconds / 3600.0) + + +ANALYSER = Exif_Analyser diff --git a/plugins/Office.py b/plugins/Office.py new file mode 100644 index 0000000..9f7073f --- /dev/null +++ b/plugins/Office.py @@ -0,0 +1,85 @@ +import zipfile +import lxml.etree +import sys + + +KEY_TO_CATEGORIES = dict() +KEY_TO_CATEGORIES['lastPrinted'] = (['time'], 1) +KEY_TO_CATEGORIES['created'] = (['time', 'creation_time'], 1) +KEY_TO_CATEGORIES['modified'] = (['time', 'modify_time'], 1) + +KEY_TO_CATEGORIES['Application'] = (['tool', 'software'], 1) +KEY_TO_CATEGORIES['AppVersion'] = (['tool', 'software'], 1) + +KEY_TO_CATEGORIES['lastModifiedBy'] = (['author', 'author_name'], 1) +KEY_TO_CATEGORIES['creator'] = (['author', 'author_name'], 1) +KEY_TO_CATEGORIES['Company'] = (['author'], 1) + + +class Office_Analyser: + def name(self): + return 'Office' + + def extract_metadata(self, path_to_file: str) -> dict: + metadata = self._extract_metadata(path_to_file=path_to_file) + metadata = self._enrich_with_categories(metadata=metadata) + return metadata + + def _enrich_with_categories(self, metadata: dict) -> dict: + enriched_metadata = list() + for key, value, describtion in metadata: + category, vlevel = KEY_TO_CATEGORIES.get(key, (list(), 3)) + enriched_metadata.append((key, value, describtion, category, vlevel)) + return enriched_metadata + + def _extract_metadata(self, path_to_file): + if zipfile.is_zipfile(path_to_file): + try: + zip_file = zipfile.ZipFile(path_to_file) + except zipfile.BadZipFile: + return list() + meta_data_from_core = self._meta_data_from_core(zip_file=zip_file) + meta_data_from_app = self._meta_data_from_app(zip_file=zip_file) + return meta_data_from_core + meta_data_from_app + 
return list() + + def _meta_data_from_core(self, zip_file): + meta_data = list() + try: + meta_data_as_xml = lxml.etree.fromstring(zip_file.read('docProps/core.xml')) + except KeyError: + return meta_data # No file docProps/core.xml + + for child in meta_data_as_xml.iterchildren(): + if child.text == None: + text = '' + else: + text = child.text + tag = Office_Analyser._get_purified_tag(child) + meta_data.append((tag, text, 'Microsoft Office - docProps/core.xml')) + return meta_data + + def _meta_data_from_app(self, zip_file): + meta_data = list() + try: + meta_data_as_xml = lxml.etree.fromstring(zip_file.read('docProps/app.xml')) + except KeyError: + return meta_data # No file docProps/app.xml + + for child in meta_data_as_xml.iterchildren(): + if child.text == None: + text = '' + else: + text = child.text + tag = Office_Analyser._get_purified_tag(child) + meta_data.append((tag, text, 'Microsoft Office - docProps/app.xml')) + return meta_data + + @staticmethod + def _get_purified_tag(element): + tag = element.tag + tag = tag.replace('{' + str(element.nsmap[element.prefix]) + '}', '') + return tag + + +ANALYSER = Office_Analyser \ No newline at end of file diff --git a/plugins/PDF.py b/plugins/PDF.py new file mode 100644 index 0000000..bda80cd --- /dev/null +++ b/plugins/PDF.py @@ -0,0 +1,41 @@ +import PyPDF2 +import sys + + +KEY_TO_CATEGORIES = dict() +KEY_TO_CATEGORIES['/Author'] = (['author', 'author_name'], 1) +KEY_TO_CATEGORIES['/Producer'] = (['author', 'author_name', 'tool', 'software'], 1) +KEY_TO_CATEGORIES['/CreationDate'] = (['time', 'creation_time'], 1) +KEY_TO_CATEGORIES['/ModDate'] = (['time', 'modify_time'], 1) + + +class PDF_Analyser: + def name(self): + return 'PDF' + + def extract_metadata(self, path_to_file: str) -> dict: + metadata = self._extract_metadata(path_to_pdf=path_to_file) + metadata = self._enrich_with_categories(metadata=metadata) + return metadata + + def _enrich_with_categories(self, metadata: dict) -> dict: + enriched_metadata = 
list() + for key, value, describtion in metadata: + category, vlevel = KEY_TO_CATEGORIES.get(key, (list(), 3)) + enriched_metadata.append((key, value, describtion, category, vlevel)) + return enriched_metadata + + def _extract_metadata(self, path_to_pdf: str) -> dict: + try: + with open(path_to_pdf, mode='rb') as file_stream: + pdf_file = PyPDF2.PdfFileReader(file_stream, strict=False) + + meta_data = pdf_file.getDocumentInfo() + return [(key, meta_data[key], 'embedded PDF metadata') for key in meta_data] + except PyPDF2.utils.PdfReadError: + return list() + except OSError: + return list() + +ANALYSER = PDF_Analyser + \ No newline at end of file diff --git a/plugins/XMP.py b/plugins/XMP.py new file mode 100644 index 0000000..34b387e --- /dev/null +++ b/plugins/XMP.py @@ -0,0 +1,90 @@ +import os +import sys + +# Does not work on windows machine +if os.name != 'nt': + from libxmp.utils import file_to_dict + + +KEY_TO_CATEGORIES = dict() + +# XMP tags +KEY_TO_CATEGORIES['xmp:CreateDate'] = (['time', 'creation_time'], 1) +KEY_TO_CATEGORIES['xmp:MetadataDate'] = (['time'], 1) +KEY_TO_CATEGORIES['xmp:ModifyDate'] = (['time', 'modify_time'], 1) +KEY_TO_CATEGORIES['xmp:CreatorTool'] = (['tool'], 1) +KEY_TO_CATEGORIES['xmp:Author'] = (['author', 'author_name'], 1) +KEY_TO_CATEGORIES['xmp:Nickname'] = (['author'], 1) + +# TIFF tags +KEY_TO_CATEGORIES['tiff:Make'] = (['tool', 'hardware'], 1) +KEY_TO_CATEGORIES['tiff:Model'] = (['tool', 'hardware'], 1) +KEY_TO_CATEGORIES['tiff:Artist'] = (['author'], 1) +KEY_TO_CATEGORIES['tiff:DateTime'] = (['time'], 1) +KEY_TO_CATEGORIES['tiff:Software'] = (['tool', 'software'], 1) + +# EXIF tags +KEY_TO_CATEGORIES['exif:DateTimeDigitized'] = (['time', 'creation_time'], 1) +KEY_TO_CATEGORIES['exif:DateTimeOriginal'] = (['time', 'creation_time'], 1) +KEY_TO_CATEGORIES['exif:UserComment'] = (['author'], 1) +KEY_TO_CATEGORIES['exif:GPSAltitude'] = (['location'], 1) +KEY_TO_CATEGORIES['exif:GPSAltitudeRef'] = (['location'], 1) 
class XMP_Analyser:
    """Plugin that reads embedded XMP metadata through ``libxmp``.

    On Windows (``os.name == 'nt'``) the plugin reports no metadata.
    """

    def name(self):
        """Return the display name of this plugin."""
        return 'XMP'

    def extract_metadata(self, path_to_file: str) -> list:
        """Extract and categorise the XMP metadata of *path_to_file*.

        Returns a list of ``(key, value, description, categories, vlevel)``
        tuples, or an empty list on Windows.
        """
        if os.name == 'nt':
            return list()
        metadata = self._extract_metadata(path_to_file=path_to_file)
        metadata = self._enrich_with_categories(metadata=metadata)
        return metadata

    def _enrich_with_categories(self, metadata: list) -> list:
        """Append the categories and level registered for each key.

        Keys that are not listed in ``KEY_TO_CATEGORIES`` get an empty
        category list and level 3.
        """
        enriched_metadata = list()
        for key, value, description in metadata:
            category, vlevel = KEY_TO_CATEGORIES.get(key, (list(), 3))
            enriched_metadata.append((key, value, description, category, vlevel))
        return enriched_metadata

    def _extract_metadata(self, path_to_file) -> list:
        """Return ``(key, value, 'embedded XMP metadata')`` tuples.

        The entries of all groups returned by ``file_to_dict`` are
        flattened into one list. An empty list is returned when libxmp
        reports an error.
        """
        # Only file_to_dict is imported at module level, so the package
        # name itself must be bound here before the except clause can
        # reference its exception classes. The import sits in this method
        # because it is never reached on Windows.
        import libxmp

        try:
            xmp_meta_data = file_to_dict(path_to_file)
        except (libxmp.XMPError, libxmp.ExempiLoadError):
            return list()

        meta_data_entries = list()
        for entries in xmp_meta_data.values():
            # The third element of each entry is not reported.
            for key, value, _ in entries:
                meta_data_entries.append((key, value, 'embedded XMP metadata'))
        return meta_data_entries