diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..af3dbf1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,9 @@
+.idea/
+.ipynb_checkpoints/
+**hdf5/
+**.h5
+src/notebooks/junk*
+src/results_analyze/__init__.py
+src/results_analyze/__pycache__/
+src/notebooks/dask-worker-space/
+src/results_analyze/data/credentials.json
\ No newline at end of file
diff --git a/src/notebooks/load_results_files.ipynb b/src/notebooks/load_results_files.ipynb
new file mode 100644
index 0000000..84083fb
--- /dev/null
+++ b/src/notebooks/load_results_files.ipynb
@@ -0,0 +1,1261 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# `load_results_file.py`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "sys.path.append('/home/ayan/Desktop/nexB/gsoc20/scancode-results-analyzer/src')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Simulating Data going into `ResultsDataFrameFile.create_file_level_dataframe` function, which is called by `ResultsDataFramePackage.create_package_level_dataframe`.\n",
+ "Using code snippets from `ResultsDataFramePackage.create_package_level_dataframe`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from results_analyze.load_results_package import ResultsDataFramePackage\n",
+ "results_package = ResultsDataFramePackage()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "SELECT path, content FROM clearcode_cditem WHERE path like '%/scancode/%'OFFSET 0 ROWS FETCH FIRST 20 ROW ONLY;\n"
+ ]
+ }
+ ],
+ "source": [
+ "path_json_dataframe = results_package.convert_records_to_json(20)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Creates `files_dataframe` and breaks out of the loop at a good example; the resulting `file_list` is what gets passed into `ResultsDataFrameFile.create_file_level_dataframe`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "files_dataframe, metadata_dataframe = results_package.modify_package_level_dataframe(path_json_dataframe)\n",
+ "\n",
+ "for package_scan_result in files_dataframe.itertuples():\n",
+ " file_list = package_scan_result[2]\n",
+ " if package_scan_result[0] == 1:\n",
+ " break"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "list"
+ ]
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "type(file_list)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(295,)"
+ ]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "np.shape(file_list)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "One of the entries inside the list of dicts."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'path': 'ccxt-1.28.76/package.json',\n",
+ " 'type': 'file',\n",
+ " 'name': 'package.json',\n",
+ " 'base_name': 'package',\n",
+ " 'extension': '.json',\n",
+ " 'size': 12024,\n",
+ " 'date': '2020-05-26',\n",
+ " 'sha1': 'a6d104bee2f3a7610bb10ad379bfb6b0d3466f89',\n",
+ " 'md5': '122169b27f6d0a1688a4d21bd3a2d4f6',\n",
+ " 'mime_type': 'text/plain',\n",
+ " 'file_type': 'ASCII text, with very long lines',\n",
+ " 'programming_language': None,\n",
+ " 'is_binary': False,\n",
+ " 'is_text': True,\n",
+ " 'is_archive': False,\n",
+ " 'is_media': False,\n",
+ " 'is_source': False,\n",
+ " 'is_script': False,\n",
+ " 'licenses': [{'key': 'mit',\n",
+ " 'score': 99,\n",
+ " 'name': 'MIT License',\n",
+ " 'short_name': 'MIT License',\n",
+ " 'category': 'Permissive',\n",
+ " 'is_exception': False,\n",
+ " 'owner': 'MIT',\n",
+ " 'homepage_url': 'http://opensource.org/licenses/mit-license.php',\n",
+ " 'text_url': 'http://opensource.org/licenses/mit-license.php',\n",
+ " 'reference_url': 'https://enterprise.dejacode.com/urn/urn:dje:license:mit',\n",
+ " 'spdx_license_key': 'MIT',\n",
+ " 'spdx_url': 'https://spdx.org/licenses/MIT',\n",
+ " 'start_line': 72,\n",
+ " 'end_line': 72,\n",
+ " 'matched_rule': {'identifier': 'mit_34.RULE',\n",
+ " 'license_expression': 'mit',\n",
+ " 'licenses': ['mit'],\n",
+ " 'is_license_text': False,\n",
+ " 'is_license_notice': False,\n",
+ " 'is_license_reference': False,\n",
+ " 'is_license_tag': True,\n",
+ " 'matcher': '2-aho',\n",
+ " 'rule_length': 2,\n",
+ " 'matched_length': 2,\n",
+ " 'match_coverage': 100,\n",
+ " 'rule_relevance': 99},\n",
+ " 'matched_text': 'license\": \"MIT\",'}],\n",
+ " 'license_expressions': ['mit'],\n",
+ " 'holders': [],\n",
+ " 'copyrights': [],\n",
+ " 'authors': [],\n",
+ " 'packages': [],\n",
+ " 'emails': [{'email': 'igor.kroitor@gmail.com',\n",
+ " 'start_line': 69,\n",
+ " 'end_line': 69}],\n",
+ " 'urls': [{'url': 'https://github.com/ccxt/ccxt.git',\n",
+ " 'start_line': 12,\n",
+ " 'end_line': 12},\n",
+ " {'url': 'https://github.com/kroitor', 'start_line': 70, 'end_line': 70},\n",
+ " {'url': 'https://github.com/ccxt/ccxt/issues',\n",
+ " 'start_line': 74,\n",
+ " 'end_line': 74},\n",
+ " {'url': 'https://ccxt.trade/', 'start_line': 76, 'end_line': 76},\n",
+ " {'url': 'https://opencollective.com/ccxt',\n",
+ " 'start_line': 522,\n",
+ " 'end_line': 522},\n",
+ " {'url': 'https://opencollective.com/ccxt/logo.txt',\n",
+ " 'start_line': 523,\n",
+ " 'end_line': 523}],\n",
+ " 'is_legal': False,\n",
+ " 'is_manifest': True,\n",
+ " 'is_readme': False,\n",
+ " 'is_top_level': True,\n",
+ " 'is_key_file': True,\n",
+ " 'is_generated': False,\n",
+ " 'is_license_text': False,\n",
+ " 'files_count': 0,\n",
+ " 'dirs_count': 0,\n",
+ " 'size_count': 0,\n",
+ " 'scan_errors': []}"
+ ]
+ },
+ "execution_count": 43,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "file_list[3]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Loads List of Dicts into DataFrame"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from results_analyze.load_results_file import ResultsDataFrameFile\n",
+ "results_file = ResultsDataFrameFile()\n",
+ "file_level_dataframe = pd.DataFrame(file_list)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Viewing DataFrame Columns and their types by calling `DataFrame.dtypes`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "path object\n",
+ "type object\n",
+ "name object\n",
+ "base_name object\n",
+ "extension object\n",
+ "size int64\n",
+ "date object\n",
+ "sha1 object\n",
+ "md5 object\n",
+ "mime_type object\n",
+ "file_type object\n",
+ "programming_language object\n",
+ "is_binary bool\n",
+ "is_text bool\n",
+ "is_archive bool\n",
+ "is_media bool\n",
+ "is_source bool\n",
+ "is_script bool\n",
+ "licenses object\n",
+ "license_expressions object\n",
+ "holders object\n",
+ "copyrights object\n",
+ "authors object\n",
+ "packages object\n",
+ "emails object\n",
+ "urls object\n",
+ "is_legal bool\n",
+ "is_manifest bool\n",
+ "is_readme bool\n",
+ "is_top_level bool\n",
+ "is_key_file bool\n",
+ "is_generated bool\n",
+ "is_license_text bool\n",
+ "files_count int64\n",
+ "dirs_count int64\n",
+ "size_count int64\n",
+ "scan_errors object\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "file_level_dataframe.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(11, 20)"
+ ]
+ },
+ "execution_count": 63,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "file_level_dataframe.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "\u001b[0;31mSignature:\u001b[0m \u001b[0mresults_file\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodify_file_level_dataframe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataframe_files\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mDocstring:\u001b[0m\n",
+ "Takes a File Level DataFrame, drops unnecessary columns, drops all directory rows, drops same files,\n",
+ "drop files with no license detections, and makes sha1 column as the file level Index [Primary Key].\n",
+ "\n",
+ ":param dataframe_files: pd.DataFrame\n",
+ " File Level DataFrames\n",
+ "\u001b[0;31mFile:\u001b[0m ~/Desktop/nexB/gsoc20/scancode-results-analyzer/src/results_analyze/load_results_file.py\n",
+ "\u001b[0;31mType:\u001b[0m method\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "results_file.modify_file_level_dataframe?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "results_file.modify_file_level_dataframe(file_level_dataframe)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(11, 20)"
+ ]
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "file_level_dataframe.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "path object\n",
+ "size int64\n",
+ "mime_type object\n",
+ "file_type object\n",
+ "programming_language object\n",
+ "is_binary bool\n",
+ "is_text bool\n",
+ "is_archive bool\n",
+ "is_media bool\n",
+ "is_source bool\n",
+ "is_script bool\n",
+ "licenses object\n",
+ "is_legal bool\n",
+ "is_manifest bool\n",
+ "is_readme bool\n",
+ "is_top_level bool\n",
+ "is_key_file bool\n",
+ "is_generated bool\n",
+ "is_license_text bool\n",
+ "license_detections_no int64\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 49,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "file_level_dataframe.dtypes"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Some entries inside `file_level_dataframe`, here `licenses` column contains list of dicts, where list length is number of license detections per file."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " path | \n",
+ " size | \n",
+ " mime_type | \n",
+ " file_type | \n",
+ " programming_language | \n",
+ " is_binary | \n",
+ " is_text | \n",
+ " is_archive | \n",
+ " is_media | \n",
+ " is_source | \n",
+ " is_script | \n",
+ " licenses | \n",
+ " is_legal | \n",
+ " is_manifest | \n",
+ " is_readme | \n",
+ " is_top_level | \n",
+ " is_key_file | \n",
+ " is_generated | \n",
+ " is_license_text | \n",
+ " license_detections_no | \n",
+ "
\n",
+ " \n",
+ " sha1 | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 3be892ba5a4cdc550ac746e257e8e944e79561de | \n",
+ " ccxt-1.28.76/LICENSE.txt | \n",
+ " 1068 | \n",
+ " text/plain | \n",
+ " UTF-8 Unicode text | \n",
+ " None | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " [{'key': 'mit', 'score': 99.4, 'name': 'MIT Li... | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " True | \n",
+ " False | \n",
+ " True | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " a6d104bee2f3a7610bb10ad379bfb6b0d3466f89 | \n",
+ " ccxt-1.28.76/package.json | \n",
+ " 12024 | \n",
+ " text/plain | \n",
+ " ASCII text, with very long lines | \n",
+ " None | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " [{'key': 'mit', 'score': 99, 'name': 'MIT Lice... | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " True | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 15ecee2af9d79cc7c86d42b31b8778faf61e7e35 | \n",
+ " ccxt-1.28.76/README.rst | \n",
+ " 104132 | \n",
+ " text/plain | \n",
+ " UTF-8 Unicode text, with very long lines | \n",
+ " None | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " [{'key': 'mit', 'score': 22, 'name': 'MIT Lice... | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 7e75216e98490df3995164be00d5d6d4ae8d63cf | \n",
+ " ccxt-1.28.76/setup.py | \n",
+ " 2619 | \n",
+ " text/x-python | \n",
+ " Python script, ASCII text executable | \n",
+ " Python | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " True | \n",
+ " [{'key': 'mit', 'score': 27, 'name': 'MIT Lice... | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " True | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " da97ffed59276dce11663a7ddb468112811debeb | \n",
+ " ccxt-1.28.76/ccxt/__init__.py | \n",
+ " 15038 | \n",
+ " text/x-python | \n",
+ " Python script, ASCII text executable | \n",
+ " Python | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " True | \n",
+ " [{'key': 'mit', 'score': 99.4, 'name': 'MIT Li... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " path \\\n",
+ "sha1 \n",
+ "3be892ba5a4cdc550ac746e257e8e944e79561de ccxt-1.28.76/LICENSE.txt \n",
+ "a6d104bee2f3a7610bb10ad379bfb6b0d3466f89 ccxt-1.28.76/package.json \n",
+ "15ecee2af9d79cc7c86d42b31b8778faf61e7e35 ccxt-1.28.76/README.rst \n",
+ "7e75216e98490df3995164be00d5d6d4ae8d63cf ccxt-1.28.76/setup.py \n",
+ "da97ffed59276dce11663a7ddb468112811debeb ccxt-1.28.76/ccxt/__init__.py \n",
+ "\n",
+ " size mime_type \\\n",
+ "sha1 \n",
+ "3be892ba5a4cdc550ac746e257e8e944e79561de 1068 text/plain \n",
+ "a6d104bee2f3a7610bb10ad379bfb6b0d3466f89 12024 text/plain \n",
+ "15ecee2af9d79cc7c86d42b31b8778faf61e7e35 104132 text/plain \n",
+ "7e75216e98490df3995164be00d5d6d4ae8d63cf 2619 text/x-python \n",
+ "da97ffed59276dce11663a7ddb468112811debeb 15038 text/x-python \n",
+ "\n",
+ " file_type \\\n",
+ "sha1 \n",
+ "3be892ba5a4cdc550ac746e257e8e944e79561de UTF-8 Unicode text \n",
+ "a6d104bee2f3a7610bb10ad379bfb6b0d3466f89 ASCII text, with very long lines \n",
+ "15ecee2af9d79cc7c86d42b31b8778faf61e7e35 UTF-8 Unicode text, with very long lines \n",
+ "7e75216e98490df3995164be00d5d6d4ae8d63cf Python script, ASCII text executable \n",
+ "da97ffed59276dce11663a7ddb468112811debeb Python script, ASCII text executable \n",
+ "\n",
+ " programming_language is_binary \\\n",
+ "sha1 \n",
+ "3be892ba5a4cdc550ac746e257e8e944e79561de None False \n",
+ "a6d104bee2f3a7610bb10ad379bfb6b0d3466f89 None False \n",
+ "15ecee2af9d79cc7c86d42b31b8778faf61e7e35 None False \n",
+ "7e75216e98490df3995164be00d5d6d4ae8d63cf Python False \n",
+ "da97ffed59276dce11663a7ddb468112811debeb Python False \n",
+ "\n",
+ " is_text is_archive is_media \\\n",
+ "sha1 \n",
+ "3be892ba5a4cdc550ac746e257e8e944e79561de True False False \n",
+ "a6d104bee2f3a7610bb10ad379bfb6b0d3466f89 True False False \n",
+ "15ecee2af9d79cc7c86d42b31b8778faf61e7e35 True False False \n",
+ "7e75216e98490df3995164be00d5d6d4ae8d63cf True False False \n",
+ "da97ffed59276dce11663a7ddb468112811debeb True False False \n",
+ "\n",
+ " is_source is_script \\\n",
+ "sha1 \n",
+ "3be892ba5a4cdc550ac746e257e8e944e79561de False False \n",
+ "a6d104bee2f3a7610bb10ad379bfb6b0d3466f89 False False \n",
+ "15ecee2af9d79cc7c86d42b31b8778faf61e7e35 False False \n",
+ "7e75216e98490df3995164be00d5d6d4ae8d63cf True True \n",
+ "da97ffed59276dce11663a7ddb468112811debeb True True \n",
+ "\n",
+ " licenses \\\n",
+ "sha1 \n",
+ "3be892ba5a4cdc550ac746e257e8e944e79561de [{'key': 'mit', 'score': 99.4, 'name': 'MIT Li... \n",
+ "a6d104bee2f3a7610bb10ad379bfb6b0d3466f89 [{'key': 'mit', 'score': 99, 'name': 'MIT Lice... \n",
+ "15ecee2af9d79cc7c86d42b31b8778faf61e7e35 [{'key': 'mit', 'score': 22, 'name': 'MIT Lice... \n",
+ "7e75216e98490df3995164be00d5d6d4ae8d63cf [{'key': 'mit', 'score': 27, 'name': 'MIT Lice... \n",
+ "da97ffed59276dce11663a7ddb468112811debeb [{'key': 'mit', 'score': 99.4, 'name': 'MIT Li... \n",
+ "\n",
+ " is_legal is_manifest is_readme \\\n",
+ "sha1 \n",
+ "3be892ba5a4cdc550ac746e257e8e944e79561de True False False \n",
+ "a6d104bee2f3a7610bb10ad379bfb6b0d3466f89 False True False \n",
+ "15ecee2af9d79cc7c86d42b31b8778faf61e7e35 False False True \n",
+ "7e75216e98490df3995164be00d5d6d4ae8d63cf False True False \n",
+ "da97ffed59276dce11663a7ddb468112811debeb False False False \n",
+ "\n",
+ " is_top_level is_key_file \\\n",
+ "sha1 \n",
+ "3be892ba5a4cdc550ac746e257e8e944e79561de True True \n",
+ "a6d104bee2f3a7610bb10ad379bfb6b0d3466f89 True True \n",
+ "15ecee2af9d79cc7c86d42b31b8778faf61e7e35 True True \n",
+ "7e75216e98490df3995164be00d5d6d4ae8d63cf True True \n",
+ "da97ffed59276dce11663a7ddb468112811debeb False False \n",
+ "\n",
+ " is_generated is_license_text \\\n",
+ "sha1 \n",
+ "3be892ba5a4cdc550ac746e257e8e944e79561de False True \n",
+ "a6d104bee2f3a7610bb10ad379bfb6b0d3466f89 False False \n",
+ "15ecee2af9d79cc7c86d42b31b8778faf61e7e35 False False \n",
+ "7e75216e98490df3995164be00d5d6d4ae8d63cf False False \n",
+ "da97ffed59276dce11663a7ddb468112811debeb False False \n",
+ "\n",
+ " license_detections_no \n",
+ "sha1 \n",
+ "3be892ba5a4cdc550ac746e257e8e944e79561de 1 \n",
+ "a6d104bee2f3a7610bb10ad379bfb6b0d3466f89 1 \n",
+ "15ecee2af9d79cc7c86d42b31b8778faf61e7e35 1 \n",
+ "7e75216e98490df3995164be00d5d6d4ae8d63cf 1 \n",
+ "da97ffed59276dce11663a7ddb468112811debeb 1 "
+ ]
+ },
+ "execution_count": 50,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "file_level_dataframe.head(5)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "These lines take all of these licenses out into `DataFrames`. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lic_level_dataframe = file_level_dataframe.groupby('sha1').licenses.apply(lambda x: pd.DataFrame(x.values[0])).reset_index()\n",
+ "lic_level_dataframe.rename(columns={'level_1': 'lic_det_num'}, inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The columns now contain only license-level information."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "sha1 object\n",
+ "lic_det_num int64\n",
+ "key object\n",
+ "score float64\n",
+ "name object\n",
+ "short_name object\n",
+ "category object\n",
+ "is_exception bool\n",
+ "owner object\n",
+ "homepage_url object\n",
+ "text_url object\n",
+ "reference_url object\n",
+ "spdx_license_key object\n",
+ "spdx_url object\n",
+ "start_line int64\n",
+ "end_line int64\n",
+ "matched_rule object\n",
+ "matched_text object\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 52,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "lic_level_dataframe.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "\u001b[0;31mSignature:\u001b[0m \u001b[0mresults_file\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodify_lic_level_dataframe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataframe_lic\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mDocstring:\u001b[0m\n",
+ "Modifies License level DataFrame, from 'matched_rule' dicts, bring information to columns.\n",
+ "Maps Rule Names and other strings to integer values to compress.\n",
+ "\n",
+ ":param dataframe_lic: pd.DataFrame\n",
+ ":return dataframe_lic: pd.DataFrame\n",
+ "\u001b[0;31mFile:\u001b[0m ~/Desktop/nexB/gsoc20/scancode-results-analyzer/src/results_analyze/load_results_file.py\n",
+ "\u001b[0;31mType:\u001b[0m method\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "results_file.modify_lic_level_dataframe?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lic_level_dataframe = results_file.modify_lic_level_dataframe(lic_level_dataframe)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "lic_det_num int64\n",
+ "key object\n",
+ "score float64\n",
+ "category object\n",
+ "is_exception bool\n",
+ "start_line int64\n",
+ "end_line int64\n",
+ "matched_text object\n",
+ "identifier object\n",
+ "is_license_text bool\n",
+ "is_license_notice bool\n",
+ "is_license_reference bool\n",
+ "is_license_tag bool\n",
+ "matcher object\n",
+ "rule_length int64\n",
+ "matched_length int64\n",
+ "match_coverage float64\n",
+ "rule_relevance int64\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 66,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "lic_level_dataframe.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lic_level_dataframe.set_index('sha1', inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Joins License level and File level Dataframes using Join operations, by the primary key `sha1`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "merged_df = file_level_dataframe.join(lic_level_dataframe, lsuffix='_file', rsuffix='_lic')\n",
+ "merged_df.reset_index(inplace=True)\n",
+ "merged_df.set_index(['sha1', 'lic_det_num'], inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Notice how one file can have many license rows: the two index columns on the left (`sha1` and `lic_det_num`) together form the primary key, with a one-to-many relationship between a file and its license detections."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " | \n",
+ " path | \n",
+ " size | \n",
+ " mime_type | \n",
+ " file_type | \n",
+ " programming_language | \n",
+ " is_binary | \n",
+ " is_text | \n",
+ " is_archive | \n",
+ " is_media | \n",
+ " is_source | \n",
+ " ... | \n",
+ " identifier | \n",
+ " is_license_text_lic | \n",
+ " is_license_notice | \n",
+ " is_license_reference | \n",
+ " is_license_tag | \n",
+ " matcher | \n",
+ " rule_length | \n",
+ " matched_length | \n",
+ " match_coverage | \n",
+ " rule_relevance | \n",
+ "
\n",
+ " \n",
+ " sha1 | \n",
+ " lic_det_num | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 15ecee2af9d79cc7c86d42b31b8778faf61e7e35 | \n",
+ " 0 | \n",
+ " ccxt-1.28.76/README.rst | \n",
+ " 104132 | \n",
+ " text/plain | \n",
+ " UTF-8 Unicode text, with very long lines | \n",
+ " None | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " ... | \n",
+ " mit_77.RULE | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " 2-aho | \n",
+ " 4 | \n",
+ " 4 | \n",
+ " 100.0 | \n",
+ " 22 | \n",
+ "
\n",
+ " \n",
+ " 1c3a8a412cca20678559fce34b46686c39a835c0 | \n",
+ " 0 | \n",
+ " ccxt-1.28.76/ccxt.egg-info/PKG-INFO | \n",
+ " 115136 | \n",
+ " text/plain | \n",
+ " UTF-8 Unicode text, with very long lines | \n",
+ " None | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " ... | \n",
+ " mit_34.RULE | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " 2-aho | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 100.0 | \n",
+ " 99 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " ccxt-1.28.76/ccxt.egg-info/PKG-INFO | \n",
+ " 115136 | \n",
+ " text/plain | \n",
+ " UTF-8 Unicode text, with very long lines | \n",
+ " None | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " ... | \n",
+ " mit_77.RULE | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " 2-aho | \n",
+ " 4 | \n",
+ " 4 | \n",
+ " 100.0 | \n",
+ " 22 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " ccxt-1.28.76/ccxt.egg-info/PKG-INFO | \n",
+ " 115136 | \n",
+ " text/plain | \n",
+ " UTF-8 Unicode text, with very long lines | \n",
+ " None | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " ... | \n",
+ " pypi_mit_license.RULE | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " 2-aho | \n",
+ " 5 | \n",
+ " 5 | \n",
+ " 100.0 | \n",
+ " 27 | \n",
+ "
\n",
+ " \n",
+ " 3be892ba5a4cdc550ac746e257e8e944e79561de | \n",
+ " 0 | \n",
+ " ccxt-1.28.76/LICENSE.txt | \n",
+ " 1068 | \n",
+ " text/plain | \n",
+ " UTF-8 Unicode text | \n",
+ " None | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " ... | \n",
+ " mit_160.RULE | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " 3-seq | \n",
+ " 167 | \n",
+ " 166 | \n",
+ " 99.4 | \n",
+ " 100 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 37 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " path \\\n",
+ "sha1 lic_det_num \n",
+ "15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 ccxt-1.28.76/README.rst \n",
+ "1c3a8a412cca20678559fce34b46686c39a835c0 0 ccxt-1.28.76/ccxt.egg-info/PKG-INFO \n",
+ " 1 ccxt-1.28.76/ccxt.egg-info/PKG-INFO \n",
+ " 2 ccxt-1.28.76/ccxt.egg-info/PKG-INFO \n",
+ "3be892ba5a4cdc550ac746e257e8e944e79561de 0 ccxt-1.28.76/LICENSE.txt \n",
+ "\n",
+ " size mime_type \\\n",
+ "sha1 lic_det_num \n",
+ "15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 104132 text/plain \n",
+ "1c3a8a412cca20678559fce34b46686c39a835c0 0 115136 text/plain \n",
+ " 1 115136 text/plain \n",
+ " 2 115136 text/plain \n",
+ "3be892ba5a4cdc550ac746e257e8e944e79561de 0 1068 text/plain \n",
+ "\n",
+ " file_type \\\n",
+ "sha1 lic_det_num \n",
+ "15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 UTF-8 Unicode text, with very long lines \n",
+ "1c3a8a412cca20678559fce34b46686c39a835c0 0 UTF-8 Unicode text, with very long lines \n",
+ " 1 UTF-8 Unicode text, with very long lines \n",
+ " 2 UTF-8 Unicode text, with very long lines \n",
+ "3be892ba5a4cdc550ac746e257e8e944e79561de 0 UTF-8 Unicode text \n",
+ "\n",
+ " programming_language \\\n",
+ "sha1 lic_det_num \n",
+ "15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 None \n",
+ "1c3a8a412cca20678559fce34b46686c39a835c0 0 None \n",
+ " 1 None \n",
+ " 2 None \n",
+ "3be892ba5a4cdc550ac746e257e8e944e79561de 0 None \n",
+ "\n",
+ " is_binary is_text \\\n",
+ "sha1 lic_det_num \n",
+ "15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 False True \n",
+ "1c3a8a412cca20678559fce34b46686c39a835c0 0 False True \n",
+ " 1 False True \n",
+ " 2 False True \n",
+ "3be892ba5a4cdc550ac746e257e8e944e79561de 0 False True \n",
+ "\n",
+ " is_archive is_media \\\n",
+ "sha1 lic_det_num \n",
+ "15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 False False \n",
+ "1c3a8a412cca20678559fce34b46686c39a835c0 0 False False \n",
+ " 1 False False \n",
+ " 2 False False \n",
+ "3be892ba5a4cdc550ac746e257e8e944e79561de 0 False False \n",
+ "\n",
+ " is_source ... \\\n",
+ "sha1 lic_det_num ... \n",
+ "15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 False ... \n",
+ "1c3a8a412cca20678559fce34b46686c39a835c0 0 False ... \n",
+ " 1 False ... \n",
+ " 2 False ... \n",
+ "3be892ba5a4cdc550ac746e257e8e944e79561de 0 False ... \n",
+ "\n",
+ " identifier \\\n",
+ "sha1 lic_det_num \n",
+ "15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 mit_77.RULE \n",
+ "1c3a8a412cca20678559fce34b46686c39a835c0 0 mit_34.RULE \n",
+ " 1 mit_77.RULE \n",
+ " 2 pypi_mit_license.RULE \n",
+ "3be892ba5a4cdc550ac746e257e8e944e79561de 0 mit_160.RULE \n",
+ "\n",
+ " is_license_text_lic \\\n",
+ "sha1 lic_det_num \n",
+ "15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 False \n",
+ "1c3a8a412cca20678559fce34b46686c39a835c0 0 False \n",
+ " 1 False \n",
+ " 2 False \n",
+ "3be892ba5a4cdc550ac746e257e8e944e79561de 0 True \n",
+ "\n",
+ " is_license_notice \\\n",
+ "sha1 lic_det_num \n",
+ "15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 False \n",
+ "1c3a8a412cca20678559fce34b46686c39a835c0 0 False \n",
+ " 1 False \n",
+ " 2 False \n",
+ "3be892ba5a4cdc550ac746e257e8e944e79561de 0 False \n",
+ "\n",
+ " is_license_reference \\\n",
+ "sha1 lic_det_num \n",
+ "15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 True \n",
+ "1c3a8a412cca20678559fce34b46686c39a835c0 0 False \n",
+ " 1 True \n",
+ " 2 False \n",
+ "3be892ba5a4cdc550ac746e257e8e944e79561de 0 False \n",
+ "\n",
+ " is_license_tag matcher \\\n",
+ "sha1 lic_det_num \n",
+ "15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 False 2-aho \n",
+ "1c3a8a412cca20678559fce34b46686c39a835c0 0 True 2-aho \n",
+ " 1 False 2-aho \n",
+ " 2 True 2-aho \n",
+ "3be892ba5a4cdc550ac746e257e8e944e79561de 0 False 3-seq \n",
+ "\n",
+ " rule_length \\\n",
+ "sha1 lic_det_num \n",
+ "15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 4 \n",
+ "1c3a8a412cca20678559fce34b46686c39a835c0 0 2 \n",
+ " 1 4 \n",
+ " 2 5 \n",
+ "3be892ba5a4cdc550ac746e257e8e944e79561de 0 167 \n",
+ "\n",
+ " matched_length \\\n",
+ "sha1 lic_det_num \n",
+ "15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 4 \n",
+ "1c3a8a412cca20678559fce34b46686c39a835c0 0 2 \n",
+ " 1 4 \n",
+ " 2 5 \n",
+ "3be892ba5a4cdc550ac746e257e8e944e79561de 0 166 \n",
+ "\n",
+ " match_coverage \\\n",
+ "sha1 lic_det_num \n",
+ "15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 100.0 \n",
+ "1c3a8a412cca20678559fce34b46686c39a835c0 0 100.0 \n",
+ " 1 100.0 \n",
+ " 2 100.0 \n",
+ "3be892ba5a4cdc550ac746e257e8e944e79561de 0 99.4 \n",
+ "\n",
+ " rule_relevance \n",
+ "sha1 lic_det_num \n",
+ "15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 22 \n",
+ "1c3a8a412cca20678559fce34b46686c39a835c0 0 99 \n",
+ " 1 22 \n",
+ " 2 27 \n",
+ "3be892ba5a4cdc550ac746e257e8e944e79561de 0 100 \n",
+ "\n",
+ "[5 rows x 37 columns]"
+ ]
+ },
+ "execution_count": 58,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "merged_df.head(5)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This is returned to the `create_package_level_dataframe` function at the package level, where this happens for every row, i.e. every package. They all get merged into one main DataFrame."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/src/notebooks/load_results_package.ipynb b/src/notebooks/load_results_package.ipynb
new file mode 100644
index 0000000..063e63c
--- /dev/null
+++ b/src/notebooks/load_results_package.ipynb
@@ -0,0 +1,2039 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import sys\n",
+ "sys.path.append('/home/ayan/Desktop/nexB/gsoc20/scancode-results-analyzer/src')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Import Class `ResultsDataFramePackage` and initialize Object"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from results_analyze.load_results_package import ResultsDataFramePackage\n",
+ "pkg_class = ResultsDataFramePackage()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Fetch Data from Postgres Database, and de-compress memoryview objects - `ResultsDataFramePackage.convert_records_to_json`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " path | \n",
+ " json_content | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " git/github/zzarcon/video-snapshot/revision/b56... | \n",
+ " {'_metadata': {'type': 'scancode', 'url': 'cd:... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " pypi/pypi/-/ccxt/revision/1.28.76/tool/scancod... | \n",
+ " {'_metadata': {'type': 'scancode', 'url': 'cd:... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " composer/packagist/motor-cms/motor-core/revisi... | \n",
+ " {'_metadata': {'type': 'scancode', 'url': 'cd:... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " composer/packagist/lucatume/wp-browser/revisio... | \n",
+ " {'_metadata': {'type': 'scancode', 'url': 'cd:... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " npm/npmjs/-/aws-sdk/revision/2.687.0/tool/scan... | \n",
+ " {'_metadata': {'type': 'scancode', 'url': 'cd:... | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " npm/npmjs/@types/babel__core/revision/7.1.8/to... | \n",
+ " {'_metadata': {'type': 'scancode', 'url': 'cd:... | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " composer/packagist/topshelfcraft/wordsmith/rev... | \n",
+ " {'_metadata': {'type': 'scancode', 'url': 'cd:... | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " pypi/pypi/-/youtube_dl/revision/2014.01.22.4/t... | \n",
+ " {'_metadata': {'type': 'scancode', 'url': 'cd:... | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " composer/packagist/qingbing/php-file-cache/rev... | \n",
+ " {'_metadata': {'type': 'scancode', 'url': 'cd:... | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " git/github/fnando/browser/revision/9ada0e23745... | \n",
+ " {'_metadata': {'type': 'scancode', 'url': 'cd:... | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " composer/packagist/phpro/grumphp/revision/0.16... | \n",
+ " {'_metadata': {'type': 'scancode', 'url': 'cd:... | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " pypi/pypi/-/yunbk/revision/0.1.59/tool/scancod... | \n",
+ " {'_metadata': {'type': 'scancode', 'url': 'cd:... | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " sourcearchive/mavencentral/org.wso2.siddhi/sid... | \n",
+ " {'_metadata': {'type': 'scancode', 'url': 'cd:... | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " npm/npmjs/-/gatsby-plugin-offline/revision/3.2... | \n",
+ " {'_metadata': {'type': 'scancode', 'url': 'cd:... | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " npm/npmjs/@typescript-eslint/experimental-util... | \n",
+ " {'_metadata': {'type': 'scancode', 'url': 'cd:... | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " npm/npmjs/@juggle/resize-observer/revision/3.2... | \n",
+ " {'_metadata': {'type': 'scancode', 'url': 'cd:... | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " composer/packagist/symfony/thanks/revision/1.2... | \n",
+ " {'_metadata': {'type': 'scancode', 'url': 'cd:... | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " sourcearchive/mavencentral/org.wso2.carbon.ide... | \n",
+ " {'_metadata': {'type': 'scancode', 'url': 'cd:... | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " pypi/pypi/-/uiautomator2/revision/0.1.4.dev30/... | \n",
+ " {'_metadata': {'type': 'scancode', 'url': 'cd:... | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " npm/npmjs/@storybook/addons/revision/6.0.0-bet... | \n",
+ " {'_metadata': {'type': 'scancode', 'url': 'cd:... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " path \\\n",
+ "0 git/github/zzarcon/video-snapshot/revision/b56... \n",
+ "1 pypi/pypi/-/ccxt/revision/1.28.76/tool/scancod... \n",
+ "2 composer/packagist/motor-cms/motor-core/revisi... \n",
+ "3 composer/packagist/lucatume/wp-browser/revisio... \n",
+ "4 npm/npmjs/-/aws-sdk/revision/2.687.0/tool/scan... \n",
+ "5 npm/npmjs/@types/babel__core/revision/7.1.8/to... \n",
+ "6 composer/packagist/topshelfcraft/wordsmith/rev... \n",
+ "7 pypi/pypi/-/youtube_dl/revision/2014.01.22.4/t... \n",
+ "8 composer/packagist/qingbing/php-file-cache/rev... \n",
+ "9 git/github/fnando/browser/revision/9ada0e23745... \n",
+ "10 composer/packagist/phpro/grumphp/revision/0.16... \n",
+ "11 pypi/pypi/-/yunbk/revision/0.1.59/tool/scancod... \n",
+ "12 sourcearchive/mavencentral/org.wso2.siddhi/sid... \n",
+ "13 npm/npmjs/-/gatsby-plugin-offline/revision/3.2... \n",
+ "14 npm/npmjs/@typescript-eslint/experimental-util... \n",
+ "15 npm/npmjs/@juggle/resize-observer/revision/3.2... \n",
+ "16 composer/packagist/symfony/thanks/revision/1.2... \n",
+ "17 sourcearchive/mavencentral/org.wso2.carbon.ide... \n",
+ "18 pypi/pypi/-/uiautomator2/revision/0.1.4.dev30/... \n",
+ "19 npm/npmjs/@storybook/addons/revision/6.0.0-bet... \n",
+ "\n",
+ " json_content \n",
+ "0 {'_metadata': {'type': 'scancode', 'url': 'cd:... \n",
+ "1 {'_metadata': {'type': 'scancode', 'url': 'cd:... \n",
+ "2 {'_metadata': {'type': 'scancode', 'url': 'cd:... \n",
+ "3 {'_metadata': {'type': 'scancode', 'url': 'cd:... \n",
+ "4 {'_metadata': {'type': 'scancode', 'url': 'cd:... \n",
+ "5 {'_metadata': {'type': 'scancode', 'url': 'cd:... \n",
+ "6 {'_metadata': {'type': 'scancode', 'url': 'cd:... \n",
+ "7 {'_metadata': {'type': 'scancode', 'url': 'cd:... \n",
+ "8 {'_metadata': {'type': 'scancode', 'url': 'cd:... \n",
+ "9 {'_metadata': {'type': 'scancode', 'url': 'cd:... \n",
+ "10 {'_metadata': {'type': 'scancode', 'url': 'cd:... \n",
+ "11 {'_metadata': {'type': 'scancode', 'url': 'cd:... \n",
+ "12 {'_metadata': {'type': 'scancode', 'url': 'cd:... \n",
+ "13 {'_metadata': {'type': 'scancode', 'url': 'cd:... \n",
+ "14 {'_metadata': {'type': 'scancode', 'url': 'cd:... \n",
+ "15 {'_metadata': {'type': 'scancode', 'url': 'cd:... \n",
+ "16 {'_metadata': {'type': 'scancode', 'url': 'cd:... \n",
+ "17 {'_metadata': {'type': 'scancode', 'url': 'cd:... \n",
+ "18 {'_metadata': {'type': 'scancode', 'url': 'cd:... \n",
+ "19 {'_metadata': {'type': 'scancode', 'url': 'cd:... "
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "path_json_dataframe = pkg_class.convert_records_to_json(20)\n",
+ "path_json_dataframe"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "dict"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "type(path_json_dataframe['json_content'][0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'type': 'scancode',\n",
+ " 'url': 'cd:/git/github/zzarcon/video-snapshot/b56386f225a2d082b1e56a0f24cfda595798c654',\n",
+ " 'fetchedAt': '2019-04-17T14:18:35.855Z',\n",
+ " 'links': {'self': {'href': 'urn:git:github:zzarcon:video-snapshot:revision:b56386f225a2d082b1e56a0f24cfda595798c654:tool:scancode:3.2.2',\n",
+ " 'type': 'resource'},\n",
+ " 'siblings': {'href': 'urn:git:github:zzarcon:video-snapshot:revision:b56386f225a2d082b1e56a0f24cfda595798c654:tool:scancode',\n",
+ " 'type': 'collection'}},\n",
+ " 'schemaVersion': '3.2.2',\n",
+ " 'toolVersion': '3.0.2',\n",
+ " 'contentType': 'application/json',\n",
+ " 'releaseDate': '2018-02-11T09:13:12.000Z',\n",
+ " 'processedAt': '2019-04-17T14:19:10.672Z'}"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "path_json_dataframe['json_content'][0]['_metadata']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[{'tool_name': 'scancode-toolkit',\n",
+ " 'tool_version': '3.0.2',\n",
+ " 'options': {'input': '/tmp/cd-rBieg5/video-snapshot',\n",
+ " '--classify': True,\n",
+ " '--copyright': True,\n",
+ " '--email': True,\n",
+ " '--generated': True,\n",
+ " '--info': True,\n",
+ " '--is-license-text': True,\n",
+ " '--json-pp': '/tmp/cd-4wjkc3',\n",
+ " '--license': True,\n",
+ " '--license-clarity-score': True,\n",
+ " '--license-diag': True,\n",
+ " '--license-text': True,\n",
+ " '--package': True,\n",
+ " '--processes': '2',\n",
+ " '--strip-root': True,\n",
+ " '--summary': True,\n",
+ " '--summary-key-files': True,\n",
+ " '--timeout': '1000.0',\n",
+ " '--url': True},\n",
+ " 'notice': 'Generated with ScanCode and provided on an \"AS IS\" BASIS, WITHOUT WARRANTIES\\nOR CONDITIONS OF ANY KIND, either express or implied. No content created from\\nScanCode should be considered or used as legal advice. Consult an Attorney\\nfor any legal advice.\\nScanCode is a free software code scanning tool from nexB Inc. and others.\\nVisit https://github.com/nexB/scancode-toolkit/ for support and download.',\n",
+ " 'start_timestamp': '2019-04-17T141841.687970',\n",
+ " 'end_timestamp': '2019-04-17T141909.130763',\n",
+ " 'message': None,\n",
+ " 'errors': [],\n",
+ " 'extra_data': {'files_count': 14}}]"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "path_json_dataframe['json_content'][0]['content']['headers']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "path object\n",
+ "json_content object\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "path_json_dataframe.dtypes"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Package level Dataframe is modified, and new columns are added from dicts inside the column"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "\u001b[0;31mSignature:\u001b[0m \u001b[0mpkg_class\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodify_package_level_dataframe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmetadata_dataframe\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mDocstring:\u001b[0m\n",
+ "This function is applied to one column of a Dataframe containing json dicts, at once, to perform\n",
+ "vectorized data retrieval. Then convert this row of values/lists to dataframes.\n",
+ "The DataFrames column name is the `name_value`.\n",
+ "\n",
+ ":param metadata_dataframe : pd.DataFrame\n",
+ "\n",
+ ":returns\n",
+ "files_dataframe : pd.DataFrame\n",
+ " DataFrame, containing a two columns, which has the path_string in one, and has a list of dicts in each row\n",
+ " of the other column, which is list of file-level dicts.\n",
+ "metadata_dataframe : pd.DataFrame\n",
+ " DataFrame, containing a new column for the value/list, from inside the JSON dict.\n",
+ "\u001b[0;31mFile:\u001b[0m ~/Desktop/nexB/gsoc20/scancode-results-analyzer/src/results_analyze/load_results_package.py\n",
+ "\u001b[0;31mType:\u001b[0m method\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "pkg_class.modify_package_level_dataframe?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "files_dataframe, metadata_dataframe = pkg_class.modify_package_level_dataframe(path_json_dataframe)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "path object\n",
+ "score int64\n",
+ "has_declared_license_in_key_files bool\n",
+ "file_level_license_and_copyright_coverage float64\n",
+ "has_consistent_key_and_file_level_licenses bool\n",
+ "is_using_only_spdx_licenses bool\n",
+ "has_full_text_for_all_licenses bool\n",
+ "TimeIndex datetime64[ns, UTC]\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "metadata_dataframe.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "TimeIndex datetime64[ns, UTC]\n",
+ "Files object\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "files_dataframe.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "list"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "type(path_json_dataframe['json_content'][0]['content']['files'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'path': 'LICENSE',\n",
+ " 'type': 'file',\n",
+ " 'name': 'LICENSE',\n",
+ " 'base_name': 'LICENSE',\n",
+ " 'extension': '',\n",
+ " 'size': 1070,\n",
+ " 'date': '2019-04-17',\n",
+ " 'sha1': 'c31f1358e9e15586333a3a91298e770a9d360867',\n",
+ " 'md5': '2f0943b127960f438881e0550ddf9fa1',\n",
+ " 'mime_type': 'text/plain',\n",
+ " 'file_type': 'ASCII text',\n",
+ " 'programming_language': None,\n",
+ " 'is_binary': False,\n",
+ " 'is_text': True,\n",
+ " 'is_archive': False,\n",
+ " 'is_media': False,\n",
+ " 'is_source': False,\n",
+ " 'is_script': False,\n",
+ " 'licenses': [{'key': 'mit',\n",
+ " 'score': 99.4,\n",
+ " 'name': 'MIT License',\n",
+ " 'short_name': 'MIT License',\n",
+ " 'category': 'Permissive',\n",
+ " 'is_exception': False,\n",
+ " 'owner': 'MIT',\n",
+ " 'homepage_url': 'http://opensource.org/licenses/mit-license.php',\n",
+ " 'text_url': 'http://opensource.org/licenses/mit-license.php',\n",
+ " 'reference_url': 'https://enterprise.dejacode.com/urn/urn:dje:license:mit',\n",
+ " 'spdx_license_key': 'MIT',\n",
+ " 'spdx_url': 'https://spdx.org/licenses/MIT',\n",
+ " 'start_line': 1,\n",
+ " 'end_line': 21,\n",
+ " 'matched_rule': {'identifier': 'mit_160.RULE',\n",
+ " 'license_expression': 'mit',\n",
+ " 'licenses': ['mit'],\n",
+ " 'is_license_text': True,\n",
+ " 'is_license_notice': False,\n",
+ " 'is_license_reference': False,\n",
+ " 'is_license_tag': False,\n",
+ " 'matcher': '3-seq',\n",
+ " 'rule_length': 167,\n",
+ " 'matched_length': 166,\n",
+ " 'match_coverage': 99.4,\n",
+ " 'rule_relevance': 100},\n",
+ " 'matched_text': 'MIT License\\n\\nCopyright ([c]) [2018] [Hector] [Zarco] \\n\\nPermission is hereby granted, free of charge, to any person obtaining a copy\\nof this software and associated documentation files (the \"Software\"), to deal\\nin the Software without restriction, including without limitation the rights\\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\\ncopies of the Software, and to permit persons to whom the Software is\\nfurnished to do so, subject to the following conditions:\\n\\nThe above copyright notice and this permission notice shall be included in all\\ncopies or substantial portions of the Software.\\n\\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\\nSOFTWARE.'}],\n",
+ " 'license_expressions': ['mit'],\n",
+ " 'holders': [{'value': 'Hector Zarco', 'start_line': 3, 'end_line': 3}],\n",
+ " 'copyrights': [{'value': 'Copyright (c) 2018 Hector Zarco',\n",
+ " 'start_line': 3,\n",
+ " 'end_line': 3}],\n",
+ " 'authors': [],\n",
+ " 'packages': [],\n",
+ " 'emails': [],\n",
+ " 'urls': [],\n",
+ " 'is_legal': True,\n",
+ " 'is_manifest': False,\n",
+ " 'is_readme': False,\n",
+ " 'is_top_level': True,\n",
+ " 'is_key_file': True,\n",
+ " 'is_generated': False,\n",
+ " 'is_license_text': True,\n",
+ " 'files_count': 0,\n",
+ " 'dirs_count': 0,\n",
+ " 'size_count': 0,\n",
+ " 'scan_errors': []}"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "files_dataframe['Files'][0][2]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pkg_class.results_file"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "file_level_dataframes_list = []\n",
+ "for package_scan_result in files_dataframe.itertuples():\n",
+ " file_level_dataframe = pkg_class.results_file.create_file_level_dataframe(package_scan_result[2])\n",
+ " file_level_dataframes_list.append(file_level_dataframe)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Columns of File level DataFrame"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "path object\n",
+ "size int64\n",
+ "mime_type object\n",
+ "file_type object\n",
+ "programming_language object\n",
+ "is_binary bool\n",
+ "is_text bool\n",
+ "is_archive bool\n",
+ "is_media bool\n",
+ "is_source bool\n",
+ "is_script bool\n",
+ "licenses object\n",
+ "is_legal bool\n",
+ "is_manifest bool\n",
+ "is_readme bool\n",
+ "is_top_level bool\n",
+ "is_key_file bool\n",
+ "is_generated bool\n",
+ "is_license_text_file bool\n",
+ "license_detections_no int64\n",
+ "key object\n",
+ "score float64\n",
+ "category object\n",
+ "is_exception bool\n",
+ "start_line int64\n",
+ "end_line int64\n",
+ "matched_text object\n",
+ "identifier object\n",
+ "is_license_text_lic bool\n",
+ "is_license_notice bool\n",
+ "is_license_reference bool\n",
+ "is_license_tag bool\n",
+ "matcher object\n",
+ "rule_length int64\n",
+ "matched_length int64\n",
+ "match_coverage float64\n",
+ "rule_relevance int64\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "file_level_dataframe.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " | \n",
+ " path | \n",
+ " size | \n",
+ " mime_type | \n",
+ " file_type | \n",
+ " programming_language | \n",
+ " is_binary | \n",
+ " is_text | \n",
+ " is_archive | \n",
+ " is_media | \n",
+ " is_source | \n",
+ " ... | \n",
+ " identifier | \n",
+ " is_license_text_lic | \n",
+ " is_license_notice | \n",
+ " is_license_reference | \n",
+ " is_license_tag | \n",
+ " matcher | \n",
+ " rule_length | \n",
+ " matched_length | \n",
+ " match_coverage | \n",
+ " rule_relevance | \n",
+ "
\n",
+ " \n",
+ " sha1 | \n",
+ " lic_det_num | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " b052b1693074fa80a08f8b97e7013b4ffa903ea2 | \n",
+ " 0 | \n",
+ " package/LICENSE | \n",
+ " 1097 | \n",
+ " text/plain | \n",
+ " ASCII text | \n",
+ " None | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " ... | \n",
+ " mit_26.RULE | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " 3-seq | \n",
+ " 167 | \n",
+ " 166 | \n",
+ " 99.4 | \n",
+ " 100 | \n",
+ "
\n",
+ " \n",
+ " ba9f5f050b26533835c3d9c95d6ee020ffba40fc | \n",
+ " 0 | \n",
+ " package/package.json | \n",
+ " 1376 | \n",
+ " text/plain | \n",
+ " ASCII text | \n",
+ " None | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " ... | \n",
+ " mit_34.RULE | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " 2-aho | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 100.0 | \n",
+ " 99 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2 rows × 37 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " path \\\n",
+ "sha1 lic_det_num \n",
+ "b052b1693074fa80a08f8b97e7013b4ffa903ea2 0 package/LICENSE \n",
+ "ba9f5f050b26533835c3d9c95d6ee020ffba40fc 0 package/package.json \n",
+ "\n",
+ " size mime_type \\\n",
+ "sha1 lic_det_num \n",
+ "b052b1693074fa80a08f8b97e7013b4ffa903ea2 0 1097 text/plain \n",
+ "ba9f5f050b26533835c3d9c95d6ee020ffba40fc 0 1376 text/plain \n",
+ "\n",
+ " file_type \\\n",
+ "sha1 lic_det_num \n",
+ "b052b1693074fa80a08f8b97e7013b4ffa903ea2 0 ASCII text \n",
+ "ba9f5f050b26533835c3d9c95d6ee020ffba40fc 0 ASCII text \n",
+ "\n",
+ " programming_language \\\n",
+ "sha1 lic_det_num \n",
+ "b052b1693074fa80a08f8b97e7013b4ffa903ea2 0 None \n",
+ "ba9f5f050b26533835c3d9c95d6ee020ffba40fc 0 None \n",
+ "\n",
+ " is_binary is_text \\\n",
+ "sha1 lic_det_num \n",
+ "b052b1693074fa80a08f8b97e7013b4ffa903ea2 0 False True \n",
+ "ba9f5f050b26533835c3d9c95d6ee020ffba40fc 0 False True \n",
+ "\n",
+ " is_archive is_media \\\n",
+ "sha1 lic_det_num \n",
+ "b052b1693074fa80a08f8b97e7013b4ffa903ea2 0 False False \n",
+ "ba9f5f050b26533835c3d9c95d6ee020ffba40fc 0 False False \n",
+ "\n",
+ " is_source ... \\\n",
+ "sha1 lic_det_num ... \n",
+ "b052b1693074fa80a08f8b97e7013b4ffa903ea2 0 False ... \n",
+ "ba9f5f050b26533835c3d9c95d6ee020ffba40fc 0 False ... \n",
+ "\n",
+ " identifier \\\n",
+ "sha1 lic_det_num \n",
+ "b052b1693074fa80a08f8b97e7013b4ffa903ea2 0 mit_26.RULE \n",
+ "ba9f5f050b26533835c3d9c95d6ee020ffba40fc 0 mit_34.RULE \n",
+ "\n",
+ " is_license_text_lic \\\n",
+ "sha1 lic_det_num \n",
+ "b052b1693074fa80a08f8b97e7013b4ffa903ea2 0 True \n",
+ "ba9f5f050b26533835c3d9c95d6ee020ffba40fc 0 False \n",
+ "\n",
+ " is_license_notice \\\n",
+ "sha1 lic_det_num \n",
+ "b052b1693074fa80a08f8b97e7013b4ffa903ea2 0 False \n",
+ "ba9f5f050b26533835c3d9c95d6ee020ffba40fc 0 False \n",
+ "\n",
+ " is_license_reference \\\n",
+ "sha1 lic_det_num \n",
+ "b052b1693074fa80a08f8b97e7013b4ffa903ea2 0 False \n",
+ "ba9f5f050b26533835c3d9c95d6ee020ffba40fc 0 False \n",
+ "\n",
+ " is_license_tag matcher \\\n",
+ "sha1 lic_det_num \n",
+ "b052b1693074fa80a08f8b97e7013b4ffa903ea2 0 False 3-seq \n",
+ "ba9f5f050b26533835c3d9c95d6ee020ffba40fc 0 True 2-aho \n",
+ "\n",
+ " rule_length \\\n",
+ "sha1 lic_det_num \n",
+ "b052b1693074fa80a08f8b97e7013b4ffa903ea2 0 167 \n",
+ "ba9f5f050b26533835c3d9c95d6ee020ffba40fc 0 2 \n",
+ "\n",
+ " matched_length \\\n",
+ "sha1 lic_det_num \n",
+ "b052b1693074fa80a08f8b97e7013b4ffa903ea2 0 166 \n",
+ "ba9f5f050b26533835c3d9c95d6ee020ffba40fc 0 2 \n",
+ "\n",
+ " match_coverage \\\n",
+ "sha1 lic_det_num \n",
+ "b052b1693074fa80a08f8b97e7013b4ffa903ea2 0 99.4 \n",
+ "ba9f5f050b26533835c3d9c95d6ee020ffba40fc 0 100.0 \n",
+ "\n",
+ " rule_relevance \n",
+ "sha1 lic_det_num \n",
+ "b052b1693074fa80a08f8b97e7013b4ffa903ea2 0 100 \n",
+ "ba9f5f050b26533835c3d9c95d6ee020ffba40fc 0 99 \n",
+ "\n",
+ "[2 rows x 37 columns]"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "file_level_dataframe.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "list_file_level_keys = list(files_dataframe['TimeIndex'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[Timestamp('2019-04-17 14:19:10.672000+0000', tz='UTC'),\n",
+ " Timestamp('2020-05-26 21:20:17.862000+0000', tz='UTC'),\n",
+ " Timestamp('2019-10-10 21:19:38.969000+0000', tz='UTC'),\n",
+ " Timestamp('2020-06-02 13:30:17.825000+0000', tz='UTC'),\n",
+ " Timestamp('2020-06-01 23:24:15.483000+0000', tz='UTC'),\n",
+ " Timestamp('2020-06-02 00:06:35.792000+0000', tz='UTC'),\n",
+ " Timestamp('2019-09-23 12:12:28.640000+0000', tz='UTC'),\n",
+ " Timestamp('2020-05-25 23:05:57.708000+0000', tz='UTC'),\n",
+ " Timestamp('2019-09-26 01:02:47.736000+0000', tz='UTC'),\n",
+ " Timestamp('2020-06-02 00:48:46.874000+0000', tz='UTC'),\n",
+ " Timestamp('2019-09-23 18:09:05.285000+0000', tz='UTC'),\n",
+ " Timestamp('2019-05-08 08:53:53.642000+0000', tz='UTC'),\n",
+ " Timestamp('2019-05-01 09:44:02.351000+0000', tz='UTC'),\n",
+ " Timestamp('2020-06-02 12:51:26.231000+0000', tz='UTC'),\n",
+ " Timestamp('2020-06-01 18:54:29.146000+0000', tz='UTC'),\n",
+ " Timestamp('2020-06-02 06:58:36.733000+0000', tz='UTC'),\n",
+ " Timestamp('2020-05-30 10:06:37.472000+0000', tz='UTC'),\n",
+ " Timestamp('2019-05-02 14:20:09.135000+0000', tz='UTC'),\n",
+ " Timestamp('2019-04-25 03:04:31.391000+0000', tz='UTC'),\n",
+ " Timestamp('2020-05-31 23:04:03.965000+0000', tz='UTC')]"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "list_file_level_keys"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "main_dataframe = pd.concat(file_level_dataframes_list,keys=list_file_level_keys)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "Notice how one package can contain many files, and one file can have many license rows. The three index levels on the left (package timestamp, `sha1` and `lic_det_num`) act as primary keys, with a one-to-many relationship from left to right."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " path | \n",
+ " size | \n",
+ " mime_type | \n",
+ " file_type | \n",
+ " programming_language | \n",
+ " is_binary | \n",
+ " is_text | \n",
+ " is_archive | \n",
+ " is_media | \n",
+ " is_source | \n",
+ " ... | \n",
+ " identifier | \n",
+ " is_license_text_lic | \n",
+ " is_license_notice | \n",
+ " is_license_reference | \n",
+ " is_license_tag | \n",
+ " matcher | \n",
+ " rule_length | \n",
+ " matched_length | \n",
+ " match_coverage | \n",
+ " rule_relevance | \n",
+ "
\n",
+ " \n",
+ " | \n",
+ " sha1 | \n",
+ " lic_det_num | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 2019-04-17 14:19:10.672000+00:00 | \n",
+ " c31f1358e9e15586333a3a91298e770a9d360867 | \n",
+ " 0 | \n",
+ " LICENSE | \n",
+ " 1070 | \n",
+ " text/plain | \n",
+ " ASCII text | \n",
+ " None | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " ... | \n",
+ " mit_160.RULE | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " 3-seq | \n",
+ " 167 | \n",
+ " 166 | \n",
+ " 99.40 | \n",
+ " 100 | \n",
+ "
\n",
+ " \n",
+ " e5dad7813f1edf787936eedfa92c421bf28080c3 | \n",
+ " 0 | \n",
+ " package.json | \n",
+ " 1102 | \n",
+ " text/plain | \n",
+ " ASCII text | \n",
+ " None | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " ... | \n",
+ " mit_34.RULE | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " 2-aho | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 100.00 | \n",
+ " 99 | \n",
+ "
\n",
+ " \n",
+ " 2020-05-26 21:20:17.862000+00:00 | \n",
+ " 15ecee2af9d79cc7c86d42b31b8778faf61e7e35 | \n",
+ " 0 | \n",
+ " ccxt-1.28.76/README.rst | \n",
+ " 104132 | \n",
+ " text/plain | \n",
+ " UTF-8 Unicode text, with very long lines | \n",
+ " None | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " ... | \n",
+ " mit_77.RULE | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " 2-aho | \n",
+ " 4 | \n",
+ " 4 | \n",
+ " 100.00 | \n",
+ " 22 | \n",
+ "
\n",
+ " \n",
+ " 1c3a8a412cca20678559fce34b46686c39a835c0 | \n",
+ " 0 | \n",
+ " ccxt-1.28.76/ccxt.egg-info/PKG-INFO | \n",
+ " 115136 | \n",
+ " text/plain | \n",
+ " UTF-8 Unicode text, with very long lines | \n",
+ " None | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " ... | \n",
+ " mit_34.RULE | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " 2-aho | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 100.00 | \n",
+ " 99 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " ccxt-1.28.76/ccxt.egg-info/PKG-INFO | \n",
+ " 115136 | \n",
+ " text/plain | \n",
+ " UTF-8 Unicode text, with very long lines | \n",
+ " None | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " ... | \n",
+ " mit_77.RULE | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " 2-aho | \n",
+ " 4 | \n",
+ " 4 | \n",
+ " 100.00 | \n",
+ " 22 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " ccxt-1.28.76/ccxt.egg-info/PKG-INFO | \n",
+ " 115136 | \n",
+ " text/plain | \n",
+ " UTF-8 Unicode text, with very long lines | \n",
+ " None | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " ... | \n",
+ " pypi_mit_license.RULE | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " 2-aho | \n",
+ " 5 | \n",
+ " 5 | \n",
+ " 100.00 | \n",
+ " 27 | \n",
+ "
\n",
+ " \n",
+ " 3be892ba5a4cdc550ac746e257e8e944e79561de | \n",
+ " 0 | \n",
+ " ccxt-1.28.76/LICENSE.txt | \n",
+ " 1068 | \n",
+ " text/plain | \n",
+ " UTF-8 Unicode text | \n",
+ " None | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " ... | \n",
+ " mit_160.RULE | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " 3-seq | \n",
+ " 167 | \n",
+ " 166 | \n",
+ " 99.40 | \n",
+ " 100 | \n",
+ "
\n",
+ " \n",
+ " 3d5143b5fffba7880c3c2d10d73ae39740dbdd2c | \n",
+ " 0 | \n",
+ " ccxt-1.28.76/ccxt/static_dependencies/ecdsa/nu... | \n",
+ " 12535 | \n",
+ " text/x-python | \n",
+ " Python script, ASCII text executable | \n",
+ " Python | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " ... | \n",
+ " public-domain_45.RULE | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " 2-aho | \n",
+ " 5 | \n",
+ " 5 | \n",
+ " 100.00 | \n",
+ " 27 | \n",
+ "
\n",
+ " \n",
+ " 4828905332706cdfebe3dfe9f699bb6616abfd76 | \n",
+ " 0 | \n",
+ " ccxt-1.28.76/ccxt/static_dependencies/ecdsa/ec... | \n",
+ " 10957 | \n",
+ " text/x-python | \n",
+ " Python script, ASCII text executable | \n",
+ " Python | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " ... | \n",
+ " public-domain_45.RULE | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " 2-aho | \n",
+ " 5 | \n",
+ " 5 | \n",
+ " 100.00 | \n",
+ " 27 | \n",
+ "
\n",
+ " \n",
+ " 7e75216e98490df3995164be00d5d6d4ae8d63cf | \n",
+ " 0 | \n",
+ " ccxt-1.28.76/setup.py | \n",
+ " 2619 | \n",
+ " text/x-python | \n",
+ " Python script, ASCII text executable | \n",
+ " Python | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " ... | \n",
+ " pypi_mit_license.RULE | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " 2-aho | \n",
+ " 5 | \n",
+ " 5 | \n",
+ " 100.00 | \n",
+ " 27 | \n",
+ "
\n",
+ " \n",
+ " 9d5dc47d7b122dbfb7005e627a352ca001dddb73 | \n",
+ " 0 | \n",
+ " ccxt-1.28.76/ccxt/base/__init__.py | \n",
+ " 1320 | \n",
+ " text/x-python | \n",
+ " Python script, ASCII text executable | \n",
+ " Python | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " ... | \n",
+ " mit_160.RULE | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " 3-seq | \n",
+ " 167 | \n",
+ " 166 | \n",
+ " 99.40 | \n",
+ " 100 | \n",
+ "
\n",
+ " \n",
+ " a5fddef68a032cd03b957cca5076c2244480a34b | \n",
+ " 0 | \n",
+ " ccxt-1.28.76/ccxt/static_dependencies/ecdsa/_v... | \n",
+ " 18461 | \n",
+ " text/x-c++ | \n",
+ " C++ source, ASCII text | \n",
+ " Python | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " ... | \n",
+ " public-domain_15.RULE | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " 2-aho | \n",
+ " 5 | \n",
+ " 5 | \n",
+ " 100.00 | \n",
+ " 100 | \n",
+ "
\n",
+ " \n",
+ " a6d104bee2f3a7610bb10ad379bfb6b0d3466f89 | \n",
+ " 0 | \n",
+ " ccxt-1.28.76/package.json | \n",
+ " 12024 | \n",
+ " text/plain | \n",
+ " ASCII text, with very long lines | \n",
+ " None | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " ... | \n",
+ " mit_34.RULE | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " 2-aho | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 100.00 | \n",
+ " 99 | \n",
+ "
\n",
+ " \n",
+ " abcd264d3854a8b5cc987f265fb581b8c59ebe50 | \n",
+ " 0 | \n",
+ " ccxt-1.28.76/ccxt/static_dependencies/ecdsa/el... | \n",
+ " 5095 | \n",
+ " text/x-python | \n",
+ " Python script, ASCII text executable | \n",
+ " Python | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " ... | \n",
+ " public-domain_45.RULE | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " 2-aho | \n",
+ " 5 | \n",
+ " 5 | \n",
+ " 100.00 | \n",
+ " 27 | \n",
+ "
\n",
+ " \n",
+ " da97ffed59276dce11663a7ddb468112811debeb | \n",
+ " 0 | \n",
+ " ccxt-1.28.76/ccxt/__init__.py | \n",
+ " 15038 | \n",
+ " text/x-python | \n",
+ " Python script, ASCII text executable | \n",
+ " Python | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " ... | \n",
+ " mit_160.RULE | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " 3-seq | \n",
+ " 167 | \n",
+ " 166 | \n",
+ " 99.40 | \n",
+ " 100 | \n",
+ "
\n",
+ " \n",
+ " 2019-10-10 21:19:38.969000+00:00 | \n",
+ " cdff21aa9a0f6aa5341e1c971146d1fc30febd1f | \n",
+ " 0 | \n",
+ " motor-cms-motor-core-03e662e/composer.json | \n",
+ " 1071 | \n",
+ " text/plain | \n",
+ " ASCII text | \n",
+ " None | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " ... | \n",
+ " mit_34.RULE | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " 2-aho | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 100.00 | \n",
+ " 99 | \n",
+ "
\n",
+ " \n",
+ " 2020-06-02 13:30:17.825000+00:00 | \n",
+ " 10117bb7a0b195dc7b6aeaae78be69c10fa7d620 | \n",
+ " 0 | \n",
+ " lucatume-wp-browser-b58f4dd/src/data/plugins/w... | \n",
+ " 4564 | \n",
+ " text/plain | \n",
+ " ASCII text, with very long lines | \n",
+ " Python | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " ... | \n",
+ " gpl-2.0_23.RULE | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " 2-aho | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 100.00 | \n",
+ " 100 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " lucatume-wp-browser-b58f4dd/src/data/plugins/w... | \n",
+ " 4564 | \n",
+ " text/plain | \n",
+ " ASCII text, with very long lines | \n",
+ " Python | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " ... | \n",
+ " gpl-2.0-plus_310.RULE | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " 3-seq | \n",
+ " 12 | \n",
+ " 11 | \n",
+ " 91.67 | \n",
+ " 66 | \n",
+ "
\n",
+ " \n",
+ " 254d7d285416df46733e9541e4da21fc498d3502 | \n",
+ " 0 | \n",
+ " lucatume-wp-browser-b58f4dd/composer.json | \n",
+ " 2617 | \n",
+ " text/plain | \n",
+ " ASCII text | \n",
+ " None | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " ... | \n",
+ " mit_34.RULE | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " 2-aho | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 100.00 | \n",
+ " 99 | \n",
+ "
\n",
+ " \n",
+ " 4b6af7167d38d0751614ad390908dc4a8e0ef652 | \n",
+ " 0 | \n",
+ " lucatume-wp-browser-b58f4dd/src/data/plugins/w... | \n",
+ " 44659 | \n",
+ " text/x-php | \n",
+ " PHP script, ASCII text | \n",
+ " PHP | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " ... | \n",
+ " gpl-2.0_550.RULE | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " 2-aho | \n",
+ " 4 | \n",
+ " 4 | \n",
+ " 100.00 | \n",
+ " 100 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
20 rows × 37 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " path \\\n",
+ " sha1 lic_det_num \n",
+ "2019-04-17 14:19:10.672000+00:00 c31f1358e9e15586333a3a91298e770a9d360867 0 LICENSE \n",
+ " e5dad7813f1edf787936eedfa92c421bf28080c3 0 package.json \n",
+ "2020-05-26 21:20:17.862000+00:00 15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 ccxt-1.28.76/README.rst \n",
+ " 1c3a8a412cca20678559fce34b46686c39a835c0 0 ccxt-1.28.76/ccxt.egg-info/PKG-INFO \n",
+ " 1 ccxt-1.28.76/ccxt.egg-info/PKG-INFO \n",
+ " 2 ccxt-1.28.76/ccxt.egg-info/PKG-INFO \n",
+ " 3be892ba5a4cdc550ac746e257e8e944e79561de 0 ccxt-1.28.76/LICENSE.txt \n",
+ " 3d5143b5fffba7880c3c2d10d73ae39740dbdd2c 0 ccxt-1.28.76/ccxt/static_dependencies/ecdsa/nu... \n",
+ " 4828905332706cdfebe3dfe9f699bb6616abfd76 0 ccxt-1.28.76/ccxt/static_dependencies/ecdsa/ec... \n",
+ " 7e75216e98490df3995164be00d5d6d4ae8d63cf 0 ccxt-1.28.76/setup.py \n",
+ " 9d5dc47d7b122dbfb7005e627a352ca001dddb73 0 ccxt-1.28.76/ccxt/base/__init__.py \n",
+ " a5fddef68a032cd03b957cca5076c2244480a34b 0 ccxt-1.28.76/ccxt/static_dependencies/ecdsa/_v... \n",
+ " a6d104bee2f3a7610bb10ad379bfb6b0d3466f89 0 ccxt-1.28.76/package.json \n",
+ " abcd264d3854a8b5cc987f265fb581b8c59ebe50 0 ccxt-1.28.76/ccxt/static_dependencies/ecdsa/el... \n",
+ " da97ffed59276dce11663a7ddb468112811debeb 0 ccxt-1.28.76/ccxt/__init__.py \n",
+ "2019-10-10 21:19:38.969000+00:00 cdff21aa9a0f6aa5341e1c971146d1fc30febd1f 0 motor-cms-motor-core-03e662e/composer.json \n",
+ "2020-06-02 13:30:17.825000+00:00 10117bb7a0b195dc7b6aeaae78be69c10fa7d620 0 lucatume-wp-browser-b58f4dd/src/data/plugins/w... \n",
+ " 1 lucatume-wp-browser-b58f4dd/src/data/plugins/w... \n",
+ " 254d7d285416df46733e9541e4da21fc498d3502 0 lucatume-wp-browser-b58f4dd/composer.json \n",
+ " 4b6af7167d38d0751614ad390908dc4a8e0ef652 0 lucatume-wp-browser-b58f4dd/src/data/plugins/w... \n",
+ "\n",
+ " size \\\n",
+ " sha1 lic_det_num \n",
+ "2019-04-17 14:19:10.672000+00:00 c31f1358e9e15586333a3a91298e770a9d360867 0 1070 \n",
+ " e5dad7813f1edf787936eedfa92c421bf28080c3 0 1102 \n",
+ "2020-05-26 21:20:17.862000+00:00 15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 104132 \n",
+ " 1c3a8a412cca20678559fce34b46686c39a835c0 0 115136 \n",
+ " 1 115136 \n",
+ " 2 115136 \n",
+ " 3be892ba5a4cdc550ac746e257e8e944e79561de 0 1068 \n",
+ " 3d5143b5fffba7880c3c2d10d73ae39740dbdd2c 0 12535 \n",
+ " 4828905332706cdfebe3dfe9f699bb6616abfd76 0 10957 \n",
+ " 7e75216e98490df3995164be00d5d6d4ae8d63cf 0 2619 \n",
+ " 9d5dc47d7b122dbfb7005e627a352ca001dddb73 0 1320 \n",
+ " a5fddef68a032cd03b957cca5076c2244480a34b 0 18461 \n",
+ " a6d104bee2f3a7610bb10ad379bfb6b0d3466f89 0 12024 \n",
+ " abcd264d3854a8b5cc987f265fb581b8c59ebe50 0 5095 \n",
+ " da97ffed59276dce11663a7ddb468112811debeb 0 15038 \n",
+ "2019-10-10 21:19:38.969000+00:00 cdff21aa9a0f6aa5341e1c971146d1fc30febd1f 0 1071 \n",
+ "2020-06-02 13:30:17.825000+00:00 10117bb7a0b195dc7b6aeaae78be69c10fa7d620 0 4564 \n",
+ " 1 4564 \n",
+ " 254d7d285416df46733e9541e4da21fc498d3502 0 2617 \n",
+ " 4b6af7167d38d0751614ad390908dc4a8e0ef652 0 44659 \n",
+ "\n",
+ " mime_type \\\n",
+ " sha1 lic_det_num \n",
+ "2019-04-17 14:19:10.672000+00:00 c31f1358e9e15586333a3a91298e770a9d360867 0 text/plain \n",
+ " e5dad7813f1edf787936eedfa92c421bf28080c3 0 text/plain \n",
+ "2020-05-26 21:20:17.862000+00:00 15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 text/plain \n",
+ " 1c3a8a412cca20678559fce34b46686c39a835c0 0 text/plain \n",
+ " 1 text/plain \n",
+ " 2 text/plain \n",
+ " 3be892ba5a4cdc550ac746e257e8e944e79561de 0 text/plain \n",
+ " 3d5143b5fffba7880c3c2d10d73ae39740dbdd2c 0 text/x-python \n",
+ " 4828905332706cdfebe3dfe9f699bb6616abfd76 0 text/x-python \n",
+ " 7e75216e98490df3995164be00d5d6d4ae8d63cf 0 text/x-python \n",
+ " 9d5dc47d7b122dbfb7005e627a352ca001dddb73 0 text/x-python \n",
+ " a5fddef68a032cd03b957cca5076c2244480a34b 0 text/x-c++ \n",
+ " a6d104bee2f3a7610bb10ad379bfb6b0d3466f89 0 text/plain \n",
+ " abcd264d3854a8b5cc987f265fb581b8c59ebe50 0 text/x-python \n",
+ " da97ffed59276dce11663a7ddb468112811debeb 0 text/x-python \n",
+ "2019-10-10 21:19:38.969000+00:00 cdff21aa9a0f6aa5341e1c971146d1fc30febd1f 0 text/plain \n",
+ "2020-06-02 13:30:17.825000+00:00 10117bb7a0b195dc7b6aeaae78be69c10fa7d620 0 text/plain \n",
+ " 1 text/plain \n",
+ " 254d7d285416df46733e9541e4da21fc498d3502 0 text/plain \n",
+ " 4b6af7167d38d0751614ad390908dc4a8e0ef652 0 text/x-php \n",
+ "\n",
+ " file_type \\\n",
+ " sha1 lic_det_num \n",
+ "2019-04-17 14:19:10.672000+00:00 c31f1358e9e15586333a3a91298e770a9d360867 0 ASCII text \n",
+ " e5dad7813f1edf787936eedfa92c421bf28080c3 0 ASCII text \n",
+ "2020-05-26 21:20:17.862000+00:00 15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 UTF-8 Unicode text, with very long lines \n",
+ " 1c3a8a412cca20678559fce34b46686c39a835c0 0 UTF-8 Unicode text, with very long lines \n",
+ " 1 UTF-8 Unicode text, with very long lines \n",
+ " 2 UTF-8 Unicode text, with very long lines \n",
+ " 3be892ba5a4cdc550ac746e257e8e944e79561de 0 UTF-8 Unicode text \n",
+ " 3d5143b5fffba7880c3c2d10d73ae39740dbdd2c 0 Python script, ASCII text executable \n",
+ " 4828905332706cdfebe3dfe9f699bb6616abfd76 0 Python script, ASCII text executable \n",
+ " 7e75216e98490df3995164be00d5d6d4ae8d63cf 0 Python script, ASCII text executable \n",
+ " 9d5dc47d7b122dbfb7005e627a352ca001dddb73 0 Python script, ASCII text executable \n",
+ " a5fddef68a032cd03b957cca5076c2244480a34b 0 C++ source, ASCII text \n",
+ " a6d104bee2f3a7610bb10ad379bfb6b0d3466f89 0 ASCII text, with very long lines \n",
+ " abcd264d3854a8b5cc987f265fb581b8c59ebe50 0 Python script, ASCII text executable \n",
+ " da97ffed59276dce11663a7ddb468112811debeb 0 Python script, ASCII text executable \n",
+ "2019-10-10 21:19:38.969000+00:00 cdff21aa9a0f6aa5341e1c971146d1fc30febd1f 0 ASCII text \n",
+ "2020-06-02 13:30:17.825000+00:00 10117bb7a0b195dc7b6aeaae78be69c10fa7d620 0 ASCII text, with very long lines \n",
+ " 1 ASCII text, with very long lines \n",
+ " 254d7d285416df46733e9541e4da21fc498d3502 0 ASCII text \n",
+ " 4b6af7167d38d0751614ad390908dc4a8e0ef652 0 PHP script, ASCII text \n",
+ "\n",
+ " programming_language \\\n",
+ " sha1 lic_det_num \n",
+ "2019-04-17 14:19:10.672000+00:00 c31f1358e9e15586333a3a91298e770a9d360867 0 None \n",
+ " e5dad7813f1edf787936eedfa92c421bf28080c3 0 None \n",
+ "2020-05-26 21:20:17.862000+00:00 15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 None \n",
+ " 1c3a8a412cca20678559fce34b46686c39a835c0 0 None \n",
+ " 1 None \n",
+ " 2 None \n",
+ " 3be892ba5a4cdc550ac746e257e8e944e79561de 0 None \n",
+ " 3d5143b5fffba7880c3c2d10d73ae39740dbdd2c 0 Python \n",
+ " 4828905332706cdfebe3dfe9f699bb6616abfd76 0 Python \n",
+ " 7e75216e98490df3995164be00d5d6d4ae8d63cf 0 Python \n",
+ " 9d5dc47d7b122dbfb7005e627a352ca001dddb73 0 Python \n",
+ " a5fddef68a032cd03b957cca5076c2244480a34b 0 Python \n",
+ " a6d104bee2f3a7610bb10ad379bfb6b0d3466f89 0 None \n",
+ " abcd264d3854a8b5cc987f265fb581b8c59ebe50 0 Python \n",
+ " da97ffed59276dce11663a7ddb468112811debeb 0 Python \n",
+ "2019-10-10 21:19:38.969000+00:00 cdff21aa9a0f6aa5341e1c971146d1fc30febd1f 0 None \n",
+ "2020-06-02 13:30:17.825000+00:00 10117bb7a0b195dc7b6aeaae78be69c10fa7d620 0 Python \n",
+ " 1 Python \n",
+ " 254d7d285416df46733e9541e4da21fc498d3502 0 None \n",
+ " 4b6af7167d38d0751614ad390908dc4a8e0ef652 0 PHP \n",
+ "\n",
+ " is_binary \\\n",
+ " sha1 lic_det_num \n",
+ "2019-04-17 14:19:10.672000+00:00 c31f1358e9e15586333a3a91298e770a9d360867 0 False \n",
+ " e5dad7813f1edf787936eedfa92c421bf28080c3 0 False \n",
+ "2020-05-26 21:20:17.862000+00:00 15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 False \n",
+ " 1c3a8a412cca20678559fce34b46686c39a835c0 0 False \n",
+ " 1 False \n",
+ " 2 False \n",
+ " 3be892ba5a4cdc550ac746e257e8e944e79561de 0 False \n",
+ " 3d5143b5fffba7880c3c2d10d73ae39740dbdd2c 0 False \n",
+ " 4828905332706cdfebe3dfe9f699bb6616abfd76 0 False \n",
+ " 7e75216e98490df3995164be00d5d6d4ae8d63cf 0 False \n",
+ " 9d5dc47d7b122dbfb7005e627a352ca001dddb73 0 False \n",
+ " a5fddef68a032cd03b957cca5076c2244480a34b 0 False \n",
+ " a6d104bee2f3a7610bb10ad379bfb6b0d3466f89 0 False \n",
+ " abcd264d3854a8b5cc987f265fb581b8c59ebe50 0 False \n",
+ " da97ffed59276dce11663a7ddb468112811debeb 0 False \n",
+ "2019-10-10 21:19:38.969000+00:00 cdff21aa9a0f6aa5341e1c971146d1fc30febd1f 0 False \n",
+ "2020-06-02 13:30:17.825000+00:00 10117bb7a0b195dc7b6aeaae78be69c10fa7d620 0 False \n",
+ " 1 False \n",
+ " 254d7d285416df46733e9541e4da21fc498d3502 0 False \n",
+ " 4b6af7167d38d0751614ad390908dc4a8e0ef652 0 False \n",
+ "\n",
+ " is_text \\\n",
+ " sha1 lic_det_num \n",
+ "2019-04-17 14:19:10.672000+00:00 c31f1358e9e15586333a3a91298e770a9d360867 0 True \n",
+ " e5dad7813f1edf787936eedfa92c421bf28080c3 0 True \n",
+ "2020-05-26 21:20:17.862000+00:00 15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 True \n",
+ " 1c3a8a412cca20678559fce34b46686c39a835c0 0 True \n",
+ " 1 True \n",
+ " 2 True \n",
+ " 3be892ba5a4cdc550ac746e257e8e944e79561de 0 True \n",
+ " 3d5143b5fffba7880c3c2d10d73ae39740dbdd2c 0 True \n",
+ " 4828905332706cdfebe3dfe9f699bb6616abfd76 0 True \n",
+ " 7e75216e98490df3995164be00d5d6d4ae8d63cf 0 True \n",
+ " 9d5dc47d7b122dbfb7005e627a352ca001dddb73 0 True \n",
+ " a5fddef68a032cd03b957cca5076c2244480a34b 0 True \n",
+ " a6d104bee2f3a7610bb10ad379bfb6b0d3466f89 0 True \n",
+ " abcd264d3854a8b5cc987f265fb581b8c59ebe50 0 True \n",
+ " da97ffed59276dce11663a7ddb468112811debeb 0 True \n",
+ "2019-10-10 21:19:38.969000+00:00 cdff21aa9a0f6aa5341e1c971146d1fc30febd1f 0 True \n",
+ "2020-06-02 13:30:17.825000+00:00 10117bb7a0b195dc7b6aeaae78be69c10fa7d620 0 True \n",
+ " 1 True \n",
+ " 254d7d285416df46733e9541e4da21fc498d3502 0 True \n",
+ " 4b6af7167d38d0751614ad390908dc4a8e0ef652 0 True \n",
+ "\n",
+ " is_archive \\\n",
+ " sha1 lic_det_num \n",
+ "2019-04-17 14:19:10.672000+00:00 c31f1358e9e15586333a3a91298e770a9d360867 0 False \n",
+ " e5dad7813f1edf787936eedfa92c421bf28080c3 0 False \n",
+ "2020-05-26 21:20:17.862000+00:00 15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 False \n",
+ " 1c3a8a412cca20678559fce34b46686c39a835c0 0 False \n",
+ " 1 False \n",
+ " 2 False \n",
+ " 3be892ba5a4cdc550ac746e257e8e944e79561de 0 False \n",
+ " 3d5143b5fffba7880c3c2d10d73ae39740dbdd2c 0 False \n",
+ " 4828905332706cdfebe3dfe9f699bb6616abfd76 0 False \n",
+ " 7e75216e98490df3995164be00d5d6d4ae8d63cf 0 False \n",
+ " 9d5dc47d7b122dbfb7005e627a352ca001dddb73 0 False \n",
+ " a5fddef68a032cd03b957cca5076c2244480a34b 0 False \n",
+ " a6d104bee2f3a7610bb10ad379bfb6b0d3466f89 0 False \n",
+ " abcd264d3854a8b5cc987f265fb581b8c59ebe50 0 False \n",
+ " da97ffed59276dce11663a7ddb468112811debeb 0 False \n",
+ "2019-10-10 21:19:38.969000+00:00 cdff21aa9a0f6aa5341e1c971146d1fc30febd1f 0 False \n",
+ "2020-06-02 13:30:17.825000+00:00 10117bb7a0b195dc7b6aeaae78be69c10fa7d620 0 False \n",
+ " 1 False \n",
+ " 254d7d285416df46733e9541e4da21fc498d3502 0 False \n",
+ " 4b6af7167d38d0751614ad390908dc4a8e0ef652 0 False \n",
+ "\n",
+ " is_media \\\n",
+ " sha1 lic_det_num \n",
+ "2019-04-17 14:19:10.672000+00:00 c31f1358e9e15586333a3a91298e770a9d360867 0 False \n",
+ " e5dad7813f1edf787936eedfa92c421bf28080c3 0 False \n",
+ "2020-05-26 21:20:17.862000+00:00 15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 False \n",
+ " 1c3a8a412cca20678559fce34b46686c39a835c0 0 False \n",
+ " 1 False \n",
+ " 2 False \n",
+ " 3be892ba5a4cdc550ac746e257e8e944e79561de 0 False \n",
+ " 3d5143b5fffba7880c3c2d10d73ae39740dbdd2c 0 False \n",
+ " 4828905332706cdfebe3dfe9f699bb6616abfd76 0 False \n",
+ " 7e75216e98490df3995164be00d5d6d4ae8d63cf 0 False \n",
+ " 9d5dc47d7b122dbfb7005e627a352ca001dddb73 0 False \n",
+ " a5fddef68a032cd03b957cca5076c2244480a34b 0 False \n",
+ " a6d104bee2f3a7610bb10ad379bfb6b0d3466f89 0 False \n",
+ " abcd264d3854a8b5cc987f265fb581b8c59ebe50 0 False \n",
+ " da97ffed59276dce11663a7ddb468112811debeb 0 False \n",
+ "2019-10-10 21:19:38.969000+00:00 cdff21aa9a0f6aa5341e1c971146d1fc30febd1f 0 False \n",
+ "2020-06-02 13:30:17.825000+00:00 10117bb7a0b195dc7b6aeaae78be69c10fa7d620 0 False \n",
+ " 1 False \n",
+ " 254d7d285416df46733e9541e4da21fc498d3502 0 False \n",
+ " 4b6af7167d38d0751614ad390908dc4a8e0ef652 0 False \n",
+ "\n",
+ " is_source \\\n",
+ " sha1 lic_det_num \n",
+ "2019-04-17 14:19:10.672000+00:00 c31f1358e9e15586333a3a91298e770a9d360867 0 False \n",
+ " e5dad7813f1edf787936eedfa92c421bf28080c3 0 False \n",
+ "2020-05-26 21:20:17.862000+00:00 15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 False \n",
+ " 1c3a8a412cca20678559fce34b46686c39a835c0 0 False \n",
+ " 1 False \n",
+ " 2 False \n",
+ " 3be892ba5a4cdc550ac746e257e8e944e79561de 0 False \n",
+ " 3d5143b5fffba7880c3c2d10d73ae39740dbdd2c 0 True \n",
+ " 4828905332706cdfebe3dfe9f699bb6616abfd76 0 True \n",
+ " 7e75216e98490df3995164be00d5d6d4ae8d63cf 0 True \n",
+ " 9d5dc47d7b122dbfb7005e627a352ca001dddb73 0 True \n",
+ " a5fddef68a032cd03b957cca5076c2244480a34b 0 True \n",
+ " a6d104bee2f3a7610bb10ad379bfb6b0d3466f89 0 False \n",
+ " abcd264d3854a8b5cc987f265fb581b8c59ebe50 0 True \n",
+ " da97ffed59276dce11663a7ddb468112811debeb 0 True \n",
+ "2019-10-10 21:19:38.969000+00:00 cdff21aa9a0f6aa5341e1c971146d1fc30febd1f 0 False \n",
+ "2020-06-02 13:30:17.825000+00:00 10117bb7a0b195dc7b6aeaae78be69c10fa7d620 0 False \n",
+ " 1 False \n",
+ " 254d7d285416df46733e9541e4da21fc498d3502 0 False \n",
+ " 4b6af7167d38d0751614ad390908dc4a8e0ef652 0 True \n",
+ "\n",
+ " ... \\\n",
+ " sha1 lic_det_num ... \n",
+ "2019-04-17 14:19:10.672000+00:00 c31f1358e9e15586333a3a91298e770a9d360867 0 ... \n",
+ " e5dad7813f1edf787936eedfa92c421bf28080c3 0 ... \n",
+ "2020-05-26 21:20:17.862000+00:00 15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 ... \n",
+ " 1c3a8a412cca20678559fce34b46686c39a835c0 0 ... \n",
+ " 1 ... \n",
+ " 2 ... \n",
+ " 3be892ba5a4cdc550ac746e257e8e944e79561de 0 ... \n",
+ " 3d5143b5fffba7880c3c2d10d73ae39740dbdd2c 0 ... \n",
+ " 4828905332706cdfebe3dfe9f699bb6616abfd76 0 ... \n",
+ " 7e75216e98490df3995164be00d5d6d4ae8d63cf 0 ... \n",
+ " 9d5dc47d7b122dbfb7005e627a352ca001dddb73 0 ... \n",
+ " a5fddef68a032cd03b957cca5076c2244480a34b 0 ... \n",
+ " a6d104bee2f3a7610bb10ad379bfb6b0d3466f89 0 ... \n",
+ " abcd264d3854a8b5cc987f265fb581b8c59ebe50 0 ... \n",
+ " da97ffed59276dce11663a7ddb468112811debeb 0 ... \n",
+ "2019-10-10 21:19:38.969000+00:00 cdff21aa9a0f6aa5341e1c971146d1fc30febd1f 0 ... \n",
+ "2020-06-02 13:30:17.825000+00:00 10117bb7a0b195dc7b6aeaae78be69c10fa7d620 0 ... \n",
+ " 1 ... \n",
+ " 254d7d285416df46733e9541e4da21fc498d3502 0 ... \n",
+ " 4b6af7167d38d0751614ad390908dc4a8e0ef652 0 ... \n",
+ "\n",
+ " identifier \\\n",
+ " sha1 lic_det_num \n",
+ "2019-04-17 14:19:10.672000+00:00 c31f1358e9e15586333a3a91298e770a9d360867 0 mit_160.RULE \n",
+ " e5dad7813f1edf787936eedfa92c421bf28080c3 0 mit_34.RULE \n",
+ "2020-05-26 21:20:17.862000+00:00 15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 mit_77.RULE \n",
+ " 1c3a8a412cca20678559fce34b46686c39a835c0 0 mit_34.RULE \n",
+ " 1 mit_77.RULE \n",
+ " 2 pypi_mit_license.RULE \n",
+ " 3be892ba5a4cdc550ac746e257e8e944e79561de 0 mit_160.RULE \n",
+ " 3d5143b5fffba7880c3c2d10d73ae39740dbdd2c 0 public-domain_45.RULE \n",
+ " 4828905332706cdfebe3dfe9f699bb6616abfd76 0 public-domain_45.RULE \n",
+ " 7e75216e98490df3995164be00d5d6d4ae8d63cf 0 pypi_mit_license.RULE \n",
+ " 9d5dc47d7b122dbfb7005e627a352ca001dddb73 0 mit_160.RULE \n",
+ " a5fddef68a032cd03b957cca5076c2244480a34b 0 public-domain_15.RULE \n",
+ " a6d104bee2f3a7610bb10ad379bfb6b0d3466f89 0 mit_34.RULE \n",
+ " abcd264d3854a8b5cc987f265fb581b8c59ebe50 0 public-domain_45.RULE \n",
+ " da97ffed59276dce11663a7ddb468112811debeb 0 mit_160.RULE \n",
+ "2019-10-10 21:19:38.969000+00:00 cdff21aa9a0f6aa5341e1c971146d1fc30febd1f 0 mit_34.RULE \n",
+ "2020-06-02 13:30:17.825000+00:00 10117bb7a0b195dc7b6aeaae78be69c10fa7d620 0 gpl-2.0_23.RULE \n",
+ " 1 gpl-2.0-plus_310.RULE \n",
+ " 254d7d285416df46733e9541e4da21fc498d3502 0 mit_34.RULE \n",
+ " 4b6af7167d38d0751614ad390908dc4a8e0ef652 0 gpl-2.0_550.RULE \n",
+ "\n",
+ " is_license_text_lic \\\n",
+ " sha1 lic_det_num \n",
+ "2019-04-17 14:19:10.672000+00:00 c31f1358e9e15586333a3a91298e770a9d360867 0 True \n",
+ " e5dad7813f1edf787936eedfa92c421bf28080c3 0 False \n",
+ "2020-05-26 21:20:17.862000+00:00 15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 False \n",
+ " 1c3a8a412cca20678559fce34b46686c39a835c0 0 False \n",
+ " 1 False \n",
+ " 2 False \n",
+ " 3be892ba5a4cdc550ac746e257e8e944e79561de 0 True \n",
+ " 3d5143b5fffba7880c3c2d10d73ae39740dbdd2c 0 True \n",
+ " 4828905332706cdfebe3dfe9f699bb6616abfd76 0 True \n",
+ " 7e75216e98490df3995164be00d5d6d4ae8d63cf 0 False \n",
+ " 9d5dc47d7b122dbfb7005e627a352ca001dddb73 0 True \n",
+ " a5fddef68a032cd03b957cca5076c2244480a34b 0 True \n",
+ " a6d104bee2f3a7610bb10ad379bfb6b0d3466f89 0 False \n",
+ " abcd264d3854a8b5cc987f265fb581b8c59ebe50 0 True \n",
+ " da97ffed59276dce11663a7ddb468112811debeb 0 True \n",
+ "2019-10-10 21:19:38.969000+00:00 cdff21aa9a0f6aa5341e1c971146d1fc30febd1f 0 False \n",
+ "2020-06-02 13:30:17.825000+00:00 10117bb7a0b195dc7b6aeaae78be69c10fa7d620 0 False \n",
+ " 1 False \n",
+ " 254d7d285416df46733e9541e4da21fc498d3502 0 False \n",
+ " 4b6af7167d38d0751614ad390908dc4a8e0ef652 0 False \n",
+ "\n",
+ " is_license_notice \\\n",
+ " sha1 lic_det_num \n",
+ "2019-04-17 14:19:10.672000+00:00 c31f1358e9e15586333a3a91298e770a9d360867 0 False \n",
+ " e5dad7813f1edf787936eedfa92c421bf28080c3 0 False \n",
+ "2020-05-26 21:20:17.862000+00:00 15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 False \n",
+ " 1c3a8a412cca20678559fce34b46686c39a835c0 0 False \n",
+ " 1 False \n",
+ " 2 False \n",
+ " 3be892ba5a4cdc550ac746e257e8e944e79561de 0 False \n",
+ " 3d5143b5fffba7880c3c2d10d73ae39740dbdd2c 0 False \n",
+ " 4828905332706cdfebe3dfe9f699bb6616abfd76 0 False \n",
+ " 7e75216e98490df3995164be00d5d6d4ae8d63cf 0 False \n",
+ " 9d5dc47d7b122dbfb7005e627a352ca001dddb73 0 False \n",
+ " a5fddef68a032cd03b957cca5076c2244480a34b 0 False \n",
+ " a6d104bee2f3a7610bb10ad379bfb6b0d3466f89 0 False \n",
+ " abcd264d3854a8b5cc987f265fb581b8c59ebe50 0 False \n",
+ " da97ffed59276dce11663a7ddb468112811debeb 0 False \n",
+ "2019-10-10 21:19:38.969000+00:00 cdff21aa9a0f6aa5341e1c971146d1fc30febd1f 0 False \n",
+ "2020-06-02 13:30:17.825000+00:00 10117bb7a0b195dc7b6aeaae78be69c10fa7d620 0 False \n",
+ " 1 False \n",
+ " 254d7d285416df46733e9541e4da21fc498d3502 0 False \n",
+ " 4b6af7167d38d0751614ad390908dc4a8e0ef652 0 False \n",
+ "\n",
+ " is_license_reference \\\n",
+ " sha1 lic_det_num \n",
+ "2019-04-17 14:19:10.672000+00:00 c31f1358e9e15586333a3a91298e770a9d360867 0 False \n",
+ " e5dad7813f1edf787936eedfa92c421bf28080c3 0 False \n",
+ "2020-05-26 21:20:17.862000+00:00 15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 True \n",
+ " 1c3a8a412cca20678559fce34b46686c39a835c0 0 False \n",
+ " 1 True \n",
+ " 2 False \n",
+ " 3be892ba5a4cdc550ac746e257e8e944e79561de 0 False \n",
+ " 3d5143b5fffba7880c3c2d10d73ae39740dbdd2c 0 False \n",
+ " 4828905332706cdfebe3dfe9f699bb6616abfd76 0 False \n",
+ " 7e75216e98490df3995164be00d5d6d4ae8d63cf 0 False \n",
+ " 9d5dc47d7b122dbfb7005e627a352ca001dddb73 0 False \n",
+ " a5fddef68a032cd03b957cca5076c2244480a34b 0 False \n",
+ " a6d104bee2f3a7610bb10ad379bfb6b0d3466f89 0 False \n",
+ " abcd264d3854a8b5cc987f265fb581b8c59ebe50 0 False \n",
+ " da97ffed59276dce11663a7ddb468112811debeb 0 False \n",
+ "2019-10-10 21:19:38.969000+00:00 cdff21aa9a0f6aa5341e1c971146d1fc30febd1f 0 False \n",
+ "2020-06-02 13:30:17.825000+00:00 10117bb7a0b195dc7b6aeaae78be69c10fa7d620 0 False \n",
+ " 1 True \n",
+ " 254d7d285416df46733e9541e4da21fc498d3502 0 False \n",
+ " 4b6af7167d38d0751614ad390908dc4a8e0ef652 0 False \n",
+ "\n",
+ " is_license_tag \\\n",
+ " sha1 lic_det_num \n",
+ "2019-04-17 14:19:10.672000+00:00 c31f1358e9e15586333a3a91298e770a9d360867 0 False \n",
+ " e5dad7813f1edf787936eedfa92c421bf28080c3 0 True \n",
+ "2020-05-26 21:20:17.862000+00:00 15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 False \n",
+ " 1c3a8a412cca20678559fce34b46686c39a835c0 0 True \n",
+ " 1 False \n",
+ " 2 True \n",
+ " 3be892ba5a4cdc550ac746e257e8e944e79561de 0 False \n",
+ " 3d5143b5fffba7880c3c2d10d73ae39740dbdd2c 0 False \n",
+ " 4828905332706cdfebe3dfe9f699bb6616abfd76 0 False \n",
+ " 7e75216e98490df3995164be00d5d6d4ae8d63cf 0 True \n",
+ " 9d5dc47d7b122dbfb7005e627a352ca001dddb73 0 False \n",
+ " a5fddef68a032cd03b957cca5076c2244480a34b 0 False \n",
+ " a6d104bee2f3a7610bb10ad379bfb6b0d3466f89 0 True \n",
+ " abcd264d3854a8b5cc987f265fb581b8c59ebe50 0 False \n",
+ " da97ffed59276dce11663a7ddb468112811debeb 0 False \n",
+ "2019-10-10 21:19:38.969000+00:00 cdff21aa9a0f6aa5341e1c971146d1fc30febd1f 0 True \n",
+ "2020-06-02 13:30:17.825000+00:00 10117bb7a0b195dc7b6aeaae78be69c10fa7d620 0 True \n",
+ " 1 False \n",
+ " 254d7d285416df46733e9541e4da21fc498d3502 0 True \n",
+ " 4b6af7167d38d0751614ad390908dc4a8e0ef652 0 True \n",
+ "\n",
+ " matcher \\\n",
+ " sha1 lic_det_num \n",
+ "2019-04-17 14:19:10.672000+00:00 c31f1358e9e15586333a3a91298e770a9d360867 0 3-seq \n",
+ " e5dad7813f1edf787936eedfa92c421bf28080c3 0 2-aho \n",
+ "2020-05-26 21:20:17.862000+00:00 15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 2-aho \n",
+ " 1c3a8a412cca20678559fce34b46686c39a835c0 0 2-aho \n",
+ " 1 2-aho \n",
+ " 2 2-aho \n",
+ " 3be892ba5a4cdc550ac746e257e8e944e79561de 0 3-seq \n",
+ " 3d5143b5fffba7880c3c2d10d73ae39740dbdd2c 0 2-aho \n",
+ " 4828905332706cdfebe3dfe9f699bb6616abfd76 0 2-aho \n",
+ " 7e75216e98490df3995164be00d5d6d4ae8d63cf 0 2-aho \n",
+ " 9d5dc47d7b122dbfb7005e627a352ca001dddb73 0 3-seq \n",
+ " a5fddef68a032cd03b957cca5076c2244480a34b 0 2-aho \n",
+ " a6d104bee2f3a7610bb10ad379bfb6b0d3466f89 0 2-aho \n",
+ " abcd264d3854a8b5cc987f265fb581b8c59ebe50 0 2-aho \n",
+ " da97ffed59276dce11663a7ddb468112811debeb 0 3-seq \n",
+ "2019-10-10 21:19:38.969000+00:00 cdff21aa9a0f6aa5341e1c971146d1fc30febd1f 0 2-aho \n",
+ "2020-06-02 13:30:17.825000+00:00 10117bb7a0b195dc7b6aeaae78be69c10fa7d620 0 2-aho \n",
+ " 1 3-seq \n",
+ " 254d7d285416df46733e9541e4da21fc498d3502 0 2-aho \n",
+ " 4b6af7167d38d0751614ad390908dc4a8e0ef652 0 2-aho \n",
+ "\n",
+ " rule_length \\\n",
+ " sha1 lic_det_num \n",
+ "2019-04-17 14:19:10.672000+00:00 c31f1358e9e15586333a3a91298e770a9d360867 0 167 \n",
+ " e5dad7813f1edf787936eedfa92c421bf28080c3 0 2 \n",
+ "2020-05-26 21:20:17.862000+00:00 15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 4 \n",
+ " 1c3a8a412cca20678559fce34b46686c39a835c0 0 2 \n",
+ " 1 4 \n",
+ " 2 5 \n",
+ " 3be892ba5a4cdc550ac746e257e8e944e79561de 0 167 \n",
+ " 3d5143b5fffba7880c3c2d10d73ae39740dbdd2c 0 5 \n",
+ " 4828905332706cdfebe3dfe9f699bb6616abfd76 0 5 \n",
+ " 7e75216e98490df3995164be00d5d6d4ae8d63cf 0 5 \n",
+ " 9d5dc47d7b122dbfb7005e627a352ca001dddb73 0 167 \n",
+ " a5fddef68a032cd03b957cca5076c2244480a34b 0 5 \n",
+ " a6d104bee2f3a7610bb10ad379bfb6b0d3466f89 0 2 \n",
+ " abcd264d3854a8b5cc987f265fb581b8c59ebe50 0 5 \n",
+ " da97ffed59276dce11663a7ddb468112811debeb 0 167 \n",
+ "2019-10-10 21:19:38.969000+00:00 cdff21aa9a0f6aa5341e1c971146d1fc30febd1f 0 2 \n",
+ "2020-06-02 13:30:17.825000+00:00 10117bb7a0b195dc7b6aeaae78be69c10fa7d620 0 2 \n",
+ " 1 12 \n",
+ " 254d7d285416df46733e9541e4da21fc498d3502 0 2 \n",
+ " 4b6af7167d38d0751614ad390908dc4a8e0ef652 0 4 \n",
+ "\n",
+ " matched_length \\\n",
+ " sha1 lic_det_num \n",
+ "2019-04-17 14:19:10.672000+00:00 c31f1358e9e15586333a3a91298e770a9d360867 0 166 \n",
+ " e5dad7813f1edf787936eedfa92c421bf28080c3 0 2 \n",
+ "2020-05-26 21:20:17.862000+00:00 15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 4 \n",
+ " 1c3a8a412cca20678559fce34b46686c39a835c0 0 2 \n",
+ " 1 4 \n",
+ " 2 5 \n",
+ " 3be892ba5a4cdc550ac746e257e8e944e79561de 0 166 \n",
+ " 3d5143b5fffba7880c3c2d10d73ae39740dbdd2c 0 5 \n",
+ " 4828905332706cdfebe3dfe9f699bb6616abfd76 0 5 \n",
+ " 7e75216e98490df3995164be00d5d6d4ae8d63cf 0 5 \n",
+ " 9d5dc47d7b122dbfb7005e627a352ca001dddb73 0 166 \n",
+ " a5fddef68a032cd03b957cca5076c2244480a34b 0 5 \n",
+ " a6d104bee2f3a7610bb10ad379bfb6b0d3466f89 0 2 \n",
+ " abcd264d3854a8b5cc987f265fb581b8c59ebe50 0 5 \n",
+ " da97ffed59276dce11663a7ddb468112811debeb 0 166 \n",
+ "2019-10-10 21:19:38.969000+00:00 cdff21aa9a0f6aa5341e1c971146d1fc30febd1f 0 2 \n",
+ "2020-06-02 13:30:17.825000+00:00 10117bb7a0b195dc7b6aeaae78be69c10fa7d620 0 2 \n",
+ " 1 11 \n",
+ " 254d7d285416df46733e9541e4da21fc498d3502 0 2 \n",
+ " 4b6af7167d38d0751614ad390908dc4a8e0ef652 0 4 \n",
+ "\n",
+ " match_coverage \\\n",
+ " sha1 lic_det_num \n",
+ "2019-04-17 14:19:10.672000+00:00 c31f1358e9e15586333a3a91298e770a9d360867 0 99.40 \n",
+ " e5dad7813f1edf787936eedfa92c421bf28080c3 0 100.00 \n",
+ "2020-05-26 21:20:17.862000+00:00 15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 100.00 \n",
+ " 1c3a8a412cca20678559fce34b46686c39a835c0 0 100.00 \n",
+ " 1 100.00 \n",
+ " 2 100.00 \n",
+ " 3be892ba5a4cdc550ac746e257e8e944e79561de 0 99.40 \n",
+ " 3d5143b5fffba7880c3c2d10d73ae39740dbdd2c 0 100.00 \n",
+ " 4828905332706cdfebe3dfe9f699bb6616abfd76 0 100.00 \n",
+ " 7e75216e98490df3995164be00d5d6d4ae8d63cf 0 100.00 \n",
+ " 9d5dc47d7b122dbfb7005e627a352ca001dddb73 0 99.40 \n",
+ " a5fddef68a032cd03b957cca5076c2244480a34b 0 100.00 \n",
+ " a6d104bee2f3a7610bb10ad379bfb6b0d3466f89 0 100.00 \n",
+ " abcd264d3854a8b5cc987f265fb581b8c59ebe50 0 100.00 \n",
+ " da97ffed59276dce11663a7ddb468112811debeb 0 99.40 \n",
+ "2019-10-10 21:19:38.969000+00:00 cdff21aa9a0f6aa5341e1c971146d1fc30febd1f 0 100.00 \n",
+ "2020-06-02 13:30:17.825000+00:00 10117bb7a0b195dc7b6aeaae78be69c10fa7d620 0 100.00 \n",
+ " 1 91.67 \n",
+ " 254d7d285416df46733e9541e4da21fc498d3502 0 100.00 \n",
+ " 4b6af7167d38d0751614ad390908dc4a8e0ef652 0 100.00 \n",
+ "\n",
+ " rule_relevance \n",
+ " sha1 lic_det_num \n",
+ "2019-04-17 14:19:10.672000+00:00 c31f1358e9e15586333a3a91298e770a9d360867 0 100 \n",
+ " e5dad7813f1edf787936eedfa92c421bf28080c3 0 99 \n",
+ "2020-05-26 21:20:17.862000+00:00 15ecee2af9d79cc7c86d42b31b8778faf61e7e35 0 22 \n",
+ " 1c3a8a412cca20678559fce34b46686c39a835c0 0 99 \n",
+ " 1 22 \n",
+ " 2 27 \n",
+ " 3be892ba5a4cdc550ac746e257e8e944e79561de 0 100 \n",
+ " 3d5143b5fffba7880c3c2d10d73ae39740dbdd2c 0 27 \n",
+ " 4828905332706cdfebe3dfe9f699bb6616abfd76 0 27 \n",
+ " 7e75216e98490df3995164be00d5d6d4ae8d63cf 0 27 \n",
+ " 9d5dc47d7b122dbfb7005e627a352ca001dddb73 0 100 \n",
+ " a5fddef68a032cd03b957cca5076c2244480a34b 0 100 \n",
+ " a6d104bee2f3a7610bb10ad379bfb6b0d3466f89 0 99 \n",
+ " abcd264d3854a8b5cc987f265fb581b8c59ebe50 0 27 \n",
+ " da97ffed59276dce11663a7ddb468112811debeb 0 100 \n",
+ "2019-10-10 21:19:38.969000+00:00 cdff21aa9a0f6aa5341e1c971146d1fc30febd1f 0 99 \n",
+ "2020-06-02 13:30:17.825000+00:00 10117bb7a0b195dc7b6aeaae78be69c10fa7d620 0 100 \n",
+ " 1 66 \n",
+ " 254d7d285416df46733e9541e4da21fc498d3502 0 99 \n",
+ " 4b6af7167d38d0751614ad390908dc4a8e0ef652 0 100 \n",
+ "\n",
+ "[20 rows x 37 columns]"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "main_dataframe.head(20)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/src/notebooks/postgres.ipynb b/src/notebooks/postgres.ipynb
new file mode 100644
index 0000000..ac385a4
--- /dev/null
+++ b/src/notebooks/postgres.ipynb
@@ -0,0 +1,140 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# `postgres.py`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "sys.path.append('/home/ayan/Desktop/nexB/gsoc20/scancode-results-analyzer/src')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Import Class `PostgresFetch` and initialize Object"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from results_analyze.postgres import PostgresFetch\n",
+ "post_ayan = PostgresFetch()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Formats Query from Postgres Database `PostgresFetch.format_query`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ },
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "\"SELECT path, content FROM clearcode_cditem WHERE path like '%/scancode/%'OFFSET 0 ROWS FETCH FIRST 10 ROW ONLY;\""
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "post_ayan.format_query(num_rows=10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Fetch Data from Postgres Database `PostgresFetch.fetch_data`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[('git/github/zzarcon/video-snapshot/revision/b56386f225a2d082b1e56a0f24cfda595798c654/tool/scancode/3.2.2.json',\n",
+ " ),\n",
+ " ('pypi/pypi/-/ccxt/revision/1.28.76/tool/scancode/3.2.2.json',\n",
+ " ),\n",
+ " ('composer/packagist/motor-cms/motor-core/revision/0.9.1/tool/scancode/3.2.2.json',\n",
+ " ),\n",
+ " ('composer/packagist/lucatume/wp-browser/revision/2.5.7/tool/scancode/3.2.2.json',\n",
+ " ),\n",
+ " ('npm/npmjs/-/aws-sdk/revision/2.687.0/tool/scancode/3.2.2.json',\n",
+ " ),\n",
+ " ('npm/npmjs/@types/babel__core/revision/7.1.8/tool/scancode/3.2.2.json',\n",
+ " ),\n",
+ " ('composer/packagist/topshelfcraft/wordsmith/revision/3.1.0/tool/scancode/3.2.2.json',\n",
+ " ),\n",
+ " ('pypi/pypi/-/youtube_dl/revision/2014.01.22.4/tool/scancode/3.2.2.json',\n",
+ " ),\n",
+ " ('composer/packagist/qingbing/php-file-cache/revision/1.0.1/tool/scancode/3.2.2.json',\n",
+ " ),\n",
+ " ('git/github/fnando/browser/revision/9ada0e23745b82eef15035fc86fe560f826e0018/tool/scancode/3.2.2.json',\n",
+ " )]"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "post_ayan.fetch_data(10)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/src/results_analyze/data/credentials.json b/src/results_analyze/data/credentials.json
new file mode 100644
index 0000000..80663a4
--- /dev/null
+++ b/src/results_analyze/data/credentials.json
@@ -0,0 +1 @@
+{"user": "clearcode", "password": "REPLACE_WITH_PASSWORD", "host": "127.0.0.1", "port": "5432", "database": "clearcode"}
\ No newline at end of file
diff --git a/src/results_analyze/load_results_file.py b/src/results_analyze/load_results_file.py
new file mode 100644
index 0000000..4227615
--- /dev/null
+++ b/src/results_analyze/load_results_file.py
@@ -0,0 +1,146 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# http://nexb.com and https://github.com/nexB/scancode-toolkit/
+# The ScanCode software is licensed under the Apache License version 2.0.
+# Data generated with ScanCode require an acknowledgment.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# When you publish or redistribute any data created with ScanCode or any ScanCode
+# derivative work, you must accompany this data with the following acknowledgment:
+#
+# Generated with ScanCode and provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+# ScanCode should be considered or used as legal advice. Consult an Attorney
+# for any legal advice.
+# ScanCode is a free software code scanning tool from nexB Inc. and others.
+# Visit https://github.com/nexB/scancode-toolkit/ for support and download.
+
+
+import numpy as np
+import pandas as pd
+
+
+class ResultsDataFrameFile:
+    """
+    Builds a combined File and License level DataFrame from the `files` list of one package scan.
+    """
+
+    def __init__(self):
+        # Which columns to drop from a File Level Dataframe.
+        self.drop_columns_list_file_lev = ['type', 'name', 'base_name', 'extension', 'date', 'md5',
+                                           'license_expressions', 'holders', 'copyrights', 'authors', 'packages',
+                                           'emails', 'urls', 'files_count', 'dirs_count', 'size_count', 'scan_errors']
+        # Which columns to drop from a License Level Dataframe.
+        self.drop_columns_list_lic_lev = ['name', 'short_name', 'owner', 'homepage_url', 'text_url', 'reference_url',
+                                          'spdx_license_key', 'spdx_url', 'license_expression', 'matched_rule',
+                                          'licenses']
+
+    @staticmethod
+    def _empty_file_and_lic_level_dataframe():
+        """
+        Returns an empty DataFrame carrying the same ('sha1', 'lic_det_num') MultiIndex as a regular result
+        of `create_file_level_dataframe`.
+        """
+        empty_index = pd.MultiIndex.from_arrays(
+            [pd.Index([], dtype=object), pd.Index([], dtype='int64')], names=['sha1', 'lic_det_num'])
+        return pd.DataFrame(index=empty_index)
+
+    @staticmethod
+    def dict_to_rows_matched_rule_dataframes_apply(dataframe):
+        """
+        Makes Dicts keys inside dict 'matched_rule' -> Columns of License level DataFrame.
+
+        :param dataframe: pd.DataFrame
+            License level DataFrame with a column 'matched_rule' holding one dict per row.
+        :returns dataframe: pd.DataFrame
+            New DataFrame, the input joined with one column per key of the 'matched_rule' dicts.
+        """
+        # Build the expanded frame on the caller's index, so the index based join below pairs every
+        # expanded row with its source row even when `dataframe` does not have a default RangeIndex.
+        new_df = pd.DataFrame(list(dataframe['matched_rule']), index=dataframe.index)
+
+        # Merge By Index, which basically Appends Column-Wise
+        return dataframe.join(new_df)
+
+    def modify_lic_level_dataframe(self, dataframe_lic):
+        """
+        Modifies License level DataFrame, from 'matched_rule' dicts, bring information to columns, then
+        drops the columns listed in `drop_columns_list_lic_lev`.
+
+        :param dataframe_lic: pd.DataFrame
+        :return dataframe_lic_rule: pd.DataFrame
+        """
+        # From dict 'matched_rule' expand keys to DataFrame Columns
+        dataframe_lic_rule = self.dict_to_rows_matched_rule_dataframes_apply(dataframe_lic)
+
+        # Drops Unnecessary Columns
+        dataframe_lic_rule.drop(columns=self.drop_columns_list_lic_lev, inplace=True)
+
+        # ToDo: Map Strings to Int Values
+
+        return dataframe_lic_rule
+
+    def create_lic_level_dataframe(self, file_level_dataframe):
+        """
+        Takes a File Level DataFrame, creates license level dataframes, modifies and cleans them up and
+        appends columns to file level dataframes. Here, already existing file level info is also present at each
+        license level rows.
+
+        :param file_level_dataframe: pd.DataFrame
+            File Level DataFrame as left by `modify_file_level_dataframe`, i.e. indexed by 'sha1', one row
+            per file and a non-empty list of license dicts in column 'licenses'.
+        :returns merged_df: pd.DataFrame
+            One row per license detection, with 'sha1' and 'lic_det_num' as regular columns.
+        """
+        # For each file, add license level dict-keys to new columns, and multiple licenses per file into new rows
+        # Introduces new column 'level_1'(renamed to 'lic_det_num'), which is the primary key for
+        # each license detection inside one file.
+        # Every 'sha1' group has a single row (duplicates were dropped before), hence `values[0]`.
+        lic_level_dataframe = file_level_dataframe.groupby('sha1').licenses.apply(
+            lambda x: pd.DataFrame(x.values[0])).reset_index()
+        lic_level_dataframe.rename(columns={'level_1': 'lic_det_num'}, inplace=True)
+
+        # Modifies license level information
+        lic_level_dataframe = self.modify_lic_level_dataframe(lic_level_dataframe)
+
+        # makes sha1 column as the file level Index [Primary Key].
+        lic_level_dataframe.set_index('sha1', inplace=True)
+
+        merged_df = file_level_dataframe.join(lic_level_dataframe, lsuffix='_file', rsuffix='_lic')
+        merged_df.reset_index(inplace=True)
+
+        return merged_df
+
+    def modify_file_level_dataframe(self, dataframe_files):
+        """
+        Takes a File Level DataFrame, drops unnecessary columns, drops all directory rows, drops same files,
+        drop files with no license detections, and makes sha1 column as the file level Index [Primary Key].
+        The DataFrame is modified in place, nothing is returned.
+
+        :param dataframe_files: pd.DataFrame
+            File Level DataFrames
+        """
+        # Drops Unnecessary Columns
+        dataframe_files.drop(columns=self.drop_columns_list_file_lev, inplace=True)
+
+        # Drops all rows with file_type as directories, as they have `NaN` as their `sha1` values
+        dataframe_files.dropna(subset=['sha1'], inplace=True)
+
+        # Add a column number of license detections per file. A value that is not a list (e.g. NaN for a
+        # missing key) counts as zero detections instead of raising.
+        dataframe_files['license_detections_no'] = dataframe_files.licenses.apply(
+            lambda x: len(x) if isinstance(x, (list, tuple)) else 0)
+
+        # Drop files with no license detections
+        no_detections = dataframe_files.index[dataframe_files['license_detections_no'] == 0]
+        dataframe_files.drop(no_detections, inplace=True)
+
+        # Drops files that have the same sha1 hash, i.e. essentially similar files
+        dataframe_files.drop_duplicates(subset='sha1', keep="last", inplace=True)
+
+        # Makes SHA1 column the index (Slows down calculations)
+        dataframe_files.set_index('sha1', inplace=True)
+
+    def create_file_level_dataframe(self, package_files_list):
+        """
+        Creates a File and License Level DataFrame
+
+        :param package_files_list: list of file level dicts
+        :returns file_and_lic_level_dataframe: pd.DataFrame
+            Has File and License level information organized via pd.MultiIndex ('sha1', 'lic_det_num').
+            It is empty, with the same index names, when the list is empty or no file in it has a
+            license detection.
+        """
+        # Initialize the file package level list into a DataFrame (Row - Files, Columns - Dict keys inside Files)
+        file_level_dataframe = pd.DataFrame(package_files_list)
+        if file_level_dataframe.empty:
+            # No files at all, so there are no columns to clean up either
+            return self._empty_file_and_lic_level_dataframe()
+
+        # Clean Up and Modify the File Level DataFrame
+        self.modify_file_level_dataframe(file_level_dataframe)
+        if file_level_dataframe.empty:
+            # Nothing left to expand, the license level steps need at least one detection
+            return self._empty_file_and_lic_level_dataframe()
+
+        # From column 'licenses', which is a list of dicts, create License Level DataFrames
+        file_and_lic_level_dataframe = self.create_lic_level_dataframe(file_level_dataframe)
+
+        # Sets 'sha1' and 'lic_det_num' columns as the Indexes (Primary Key Tuple)
+        file_and_lic_level_dataframe.set_index(['sha1', 'lic_det_num'], inplace=True)
+
+        return file_and_lic_level_dataframe
diff --git a/src/results_analyze/load_results_package.py b/src/results_analyze/load_results_package.py
new file mode 100644
index 0000000..f0acfba
--- /dev/null
+++ b/src/results_analyze/load_results_package.py
@@ -0,0 +1,303 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# http://nexb.com and https://github.com/nexB/scancode-toolkit/
+# The ScanCode software is licensed under the Apache License version 2.0.
+# Data generated with ScanCode require an acknowledgment.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# When you publish or redistribute any data created with ScanCode or any ScanCode
+# derivative work, you must accompany this data with the following acknowledgment:
+#
+# Generated with ScanCode and provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+# ScanCode should be considered or used as legal advice. Consult an Attorney
+# for any legal advice.
+# ScanCode is a free software code scanning tool from nexB Inc. and others.
+# Visit https://github.com/nexB/scancode-toolkit/ for support and download.
+
+import os
+
+import gzip
+import json
+import pandas as pd
+
+from results_analyze.postgres import PostgresFetch
+from results_analyze.load_results_file import ResultsDataFrameFile
+
+# Default number of Database rows (i.e. package scans) `convert_records_to_json` fetches at once
+# ToDo: Calculation Function based on memory usage stats and RAM/SWAP Available
+NUM_ROWS_TO_FETCH = 20
+
+# Default PyTables storage format used by `store_dataframe_to_hdf5`:
+# 'table' (A bit slower, On-Disk Search/Query Enabled) or 'fixed' (Fast, No On-Disk Search/Query)
+HDF5_STORE_FORMAT = 'table'
+
+
+class ResultsDataFramePackage:
+    """
+    Fetches package scan results from Postgres and organizes them into Package, File and License
+    level DataFrames. Package level metadata is stored in an hdf5 file.
+    """
+
+    def __init__(self):
+        """
+        Constructor for ResultsDataFramePackage, initializes PostgresFetch and ResultsDataFrameFile objects,
+        and data paths and filenames used.
+        """
+        self.postgres = PostgresFetch()
+        self.results_file = ResultsDataFrameFile()
+        self.metadata_filename = 'projects_metadata.h5'
+        self.hdf_dir = os.path.join(os.path.dirname(__file__), 'data/hdf5/')
+
+    @staticmethod
+    def get_hdf5_file_path(hdf_dir, filename):
+        """
+        Gets filepath.
+
+        :param hdf_dir : string
+        :param filename : string
+
+        :returns file_path : string
+        """
+        return os.path.join(hdf_dir, filename)
+
+    # ToDo: Support Selective Query/Search
+    @staticmethod
+    def load_dataframe_from_hdf5(file_path, df_key):
+        """
+        Loads data from the hdf5 to a Pandas Dataframe.
+
+        :param file_path : string
+        :param df_key : string
+            Table name inside the h5 file for this dataframe.
+
+        :returns dataframe : pd.DataFrame object containing the Data read from the hdf5 file.
+        """
+        return pd.read_hdf(path_or_buf=file_path, key=df_key)
+
+    @staticmethod
+    def store_dataframe_to_hdf5(dataframe, file_path, df_key, h5_format=HDF5_STORE_FORMAT, is_append=False):
+        """
+        Stores data from a Pandas Dataframe to hdf5.
+
+        :param dataframe : pd.Dataframe
+            The DataFrame which has to be stored
+        :param file_path : string
+        :param df_key: string
+            Table name inside the h5 file for this dataframe.
+        :param h5_format : string
+            PyTables storage format
+        :param is_append : bool
+            If True, the rows are appended to the table `df_key` of an already existing file, which
+            pandas supports for the 'table' format only. If False, the file is created (or overwritten)
+            and the DataFrame is written to it.
+        """
+
+        if is_append:
+            # File has to exist. `append=True` is what makes pandas add the rows to the stored table,
+            # without it the table stored under `df_key` is replaced by `dataframe`.
+            dataframe.to_hdf(path_or_buf=file_path, key=df_key, mode='r+', format=h5_format, append=True)
+        else:
+            dataframe.to_hdf(path_or_buf=file_path, key=df_key, mode='w', format=h5_format)
+
+    def append_metadata_dataframe(self, metadata_dataframe):
+        """
+        Stores data from a Pandas Dataframe, containing metadata to hdf5. Creates the hdf5 directory and the
+        file if they don't exist, otherwise appends to the table 'metadata' of the existing file.
+
+        :param metadata_dataframe : pd.Dataframe
+            The metadata DataFrame which has to be appended
+        """
+        # The file cannot be created inside a directory that is missing, so make sure it is there
+        os.makedirs(self.hdf_dir, exist_ok=True)
+        file_path = self.get_hdf5_file_path(self.hdf_dir, self.metadata_filename)
+
+        # Appending is only possible when the file is already there, else it is created
+        self.store_dataframe_to_hdf5(metadata_dataframe, file_path, df_key='metadata',
+                                     h5_format='table', is_append=os.path.isfile(file_path))
+
+    @staticmethod
+    def decompress_dataframe(compressed_dataframe):
+        """
+        Decompresses one gzip compressed scan result and parses it into a JSON dict. This function is applied
+        to each element of a column containing memoryview objects, using the Series.apply() method.
+
+        :param compressed_dataframe : memoryview
+            One element of the column, the gzip compressed, utf-8 encoded JSON of a scan result.
+
+        :returns decompressed_dataframe : dict
+            JSON dict of the Scan Results.
+        """
+        string_json = gzip.decompress(compressed_dataframe).decode('utf-8')
+
+        return json.loads(string_json)
+
+    def convert_records_to_json(self, num_rows_to_fetch=NUM_ROWS_TO_FETCH):
+        """
+        Fetch scan_results from Postgres Database, Load into Pandas Dataframes, and Decompress into JSON dicts.
+
+        :param num_rows_to_fetch : int
+            Number of Rows to Fetch from the Postgres Database, which is essentially the number of packages scanned.
+
+        :returns dataframe_memoryview : pd.DataFrame
+            DataFrame containing two Columns 'path' and 'json_content'.
+        """
+        # Fetch A specified rows of Data From postgres Database, and load into a DataFrame
+        data_memoryview = self.postgres.fetch_data(num_rows_to_fetch)
+        dataframe_memoryview = pd.DataFrame(data_memoryview, columns=['path', 'memoryview'])
+
+        # Decompress entire `memoryview` column, add decompressed JSON dicts at `json_content`, then drop former.
+        dataframe_memoryview['json_content'] = dataframe_memoryview.memoryview.apply(self.decompress_dataframe)
+        dataframe_memoryview.drop(columns=['memoryview'], inplace=True)
+
+        return dataframe_memoryview
+
+    @staticmethod
+    def dict_to_rows_in_dataframes(dataframe, key_1, key_2):
+        """
+        Returns what is stored at `[key_1][key_2]` of one JSON dict. This function is applied to each element
+        of a column containing json dicts, using the Series.apply() method.
+
+        :param dataframe : dict
+            One element of the column, a (nested) JSON dict.
+        :param key_1 : string
+        :param key_2 : string
+
+        :returns row_data :
+            The value/dict/list that was inside the JSON dict.
+        """
+        return dataframe[key_1][key_2]
+
+    def dict_to_rows_in_dataframes_apply(self, dataframe, key_1, key_2):
+        """
+        Retrieves the dict at `[key_1][key_2]` from every JSON dict in column 'json_content', then converts
+        the column of dicts to a list of dicts, to create a DataFrame from them.
+        The DataFrames columns are those dict keys.
+
+        :param dataframe : pd.DataFrame
+            DataFrame, containing json dicts in a column.
+        :param key_1 : string
+        :param key_2 : string
+
+        :returns dataframe : pd.DataFrame
+            DataFrame, containing new columns for each dict keys, from the dict inside the JSON dict.
+        """
+        dataframe_dicts = dataframe.json_content.apply(self.dict_to_rows_in_dataframes, args=(key_1, key_2))
+
+        # Build the new frame on the caller's index, so the index based join below pairs every row with
+        # its source row even when `dataframe` does not have a default RangeIndex.
+        new_df = pd.DataFrame(list(dataframe_dicts), index=dataframe.index)
+
+        # Merge By Index, which basically Appends Column-Wise
+        return dataframe.join(new_df)
+
+    def value_to_rows_in_dataframes_apply(self, dataframe, key_1, key_2, name_value):
+        """
+        Retrieves the value/list at `[key_1][key_2]` from every JSON dict in column 'json_content' and adds
+        these values as one new column. The new column name is the `name_value`.
+
+        :param dataframe : pd.DataFrame
+            DataFrame, containing json dicts in a column.
+        :param key_1 : string
+        :param key_2 : string
+        :param name_value : string
+
+        :return dataframe : pd.DataFrame
+            DataFrame, containing a new column for the value/list, from inside the JSON dict.
+        """
+        dataframe_dicts = dataframe.json_content.apply(self.dict_to_rows_in_dataframes, args=(key_1, key_2))
+        new_df = pd.DataFrame({name_value: dataframe_dicts})
+
+        # Merge By Index, which basically Appends Column-Wise
+        return dataframe.join(new_df)
+
+    @staticmethod
+    def convert_string_to_datetime(dataframe, old_col, new_col):
+        """
+        This function takes a column of string datetime, and converts it into Pandas datetime objects.
+        The DataFrame is modified in place, nothing is returned.
+
+        :param dataframe : pd.DataFrame
+        :param old_col : string : Name of Old Column
+        :param new_col : string : Name of New Column
+        """
+        # Add Pandas DateTime Column. No `format` is passed: the scan times shown in the notebooks carry a
+        # time of day and a UTC offset, which a date-only format like '%Y-%m-%d' does not describe, and
+        # pandas versions that apply `format` strictly reject such strings.
+        dataframe[new_col] = pd.to_datetime(dataframe[old_col].tolist())
+
+        # Drop String DateTime Column
+        dataframe.drop(columns=[old_col], inplace=True)
+
+    def modify_package_level_dataframe(self, metadata_dataframe):
+        """
+        Brings the package level information out of the JSON dicts in column 'json_content' into columns:
+        the 'license_clarity_score' values, the scan time (as 'TimeIndex') and the list of files. Then splits
+        the result into a files DataFrame and a metadata DataFrame.
+
+        :param metadata_dataframe : pd.DataFrame
+            DataFrame with the scan result JSON dicts in column 'json_content'.
+
+        :returns files_dataframe : pd.DataFrame
+            DataFrame, containing two columns, 'TimeIndex' with the scan time, and 'Files' which has a list
+            of file-level dicts in each row.
+        :returns metadata_dataframe : pd.DataFrame
+            DataFrame, containing the remaining columns, i.e. without 'json_content' and 'Files'.
+        """
+        metadata_dataframe = self.dict_to_rows_in_dataframes_apply(metadata_dataframe, key_1='content',
+                                                                   key_2='license_clarity_score')
+        metadata_dataframe = self.value_to_rows_in_dataframes_apply(metadata_dataframe, key_1='_metadata',
+                                                                    key_2='processedAt', name_value='TimeProcess')
+        metadata_dataframe = self.value_to_rows_in_dataframes_apply(metadata_dataframe, key_1='content', key_2='files',
+                                                                    name_value='Files')
+        metadata_dataframe.drop(columns=['json_content'], inplace=True)
+
+        # Convert TimeProcess to TimeIndex
+        self.convert_string_to_datetime(metadata_dataframe, old_col='TimeProcess', new_col='TimeIndex')
+
+        files_dataframe = metadata_dataframe[['TimeIndex', 'Files']].copy(deep=True)
+        metadata_dataframe.drop(columns=['Files'], inplace=True)
+
+        return files_dataframe, metadata_dataframe
+
+    def create_package_level_dataframe(self):
+        """
+        Creates a Package Level DataFrame, with File/License Information Levels.
+
+        :returns main_dataframe : pd.DataFrame object
+            Main Storage DataFrame
+            Has Project, File and License level information organized via pd.MultiIndex.
+        """
+        # Loads Dataframes
+        path_json_dataframe = self.convert_records_to_json()
+
+        # ToDo: Assert Scancode Options
+
+        # Converts information multiple levels inside dicts into columns
+        # Package Level Data, TimeStamp, 'license_clarity_score' values,'files' list -> `New Columns`.
+        files_dataframe, metadata_dataframe = self.modify_package_level_dataframe(path_json_dataframe)
+
+        # Append metadata level information to a MetaData File
+        self.append_metadata_dataframe(metadata_dataframe)
+
+        # Iterate through all rows, (i.e. package scans), and calls file level function for each
+        # Collects the File and License Level DataFrames returned in a List.
+        file_level_dataframes_list = [
+            self.results_file.create_file_level_dataframe(package_files_list)
+            for package_files_list in files_dataframe['Files']
+        ]
+
+        # Creates File level keys, which are used to create package level keys in the MultiIndex
+        list_file_level_keys = list(files_dataframe['TimeIndex'])
+
+        # Concatenate File Level Dataframes from the list, and their corresponding keys
+        # into One Package Level Dataframe, using MultiIndex. Rename Primary Key column names.
+        main_dataframe = pd.concat(file_level_dataframes_list,
+                                   keys=list_file_level_keys)
+        main_dataframe.index.names = ['pkg_scan_time', 'file_sha1', 'lic_det_num']
+
+        return main_dataframe
diff --git a/src/results_analyze/postgres.py b/src/results_analyze/postgres.py
new file mode 100644
index 0000000..cf1009d
--- /dev/null
+++ b/src/results_analyze/postgres.py
@@ -0,0 +1,135 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# http://nexb.com and https://github.com/nexB/scancode-toolkit/
+# The ScanCode software is licensed under the Apache License version 2.0.
+# Data generated with ScanCode require an acknowledgment.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# When you publish or redistribute any data created with ScanCode or any ScanCode
+# derivative work, you must accompany this data with the following acknowledgment:
+#
+# Generated with ScanCode and provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+# ScanCode should be considered or used as legal advice. Consult an Attorney
+# for any legal advice.
+# ScanCode is a free software code scanning tool from nexB Inc. and others.
+# Visit https://github.com/nexB/scancode-toolkit/ for support and download.
+
+import os
+
+import psycopg2
+import json
+
+# Global Variables
+# Building blocks of the query string assembled in `PostgresFetch.format_query`
+# Goes into the FROM clause, i.e. it names the table that is queried
+DATABASE_NAME = "clearcode_cditem"
+# LIKE pattern the search column has to match
+TOOL_NAME = "%/scancode/%"
+# Columns in the SELECT list
+COLUMNS_RETURN = 'path, content'
+# Column that is compared against the LIKE pattern
+COLUMNS_SEARCH = "path"
+
+
+class PostgresFetch:
+    """
+    Connects to the Postgres Database described in data/credentials.json and fetches rows in batches,
+    keeping track of the offset between calls.
+    """
+
+    def __init__(self):
+        self.data_dir = os.path.join(os.path.dirname(__file__), 'data')
+        self.cursor, self.connection = self.init_connection()
+        # Number of rows fetched so far, used as the OFFSET of the next query
+        self.offset = 0
+
+    def get_credentials_filepath(self):
+        """
+        Get credentials file path.
+
+        :returns file_path: string
+        """
+        return os.path.join(self.data_dir, 'credentials.json')
+
+    def import_database_credentials(self):
+        """
+        Fetch postgres Database credentials.
+
+        :returns credentials: JSON dict with credentials
+        """
+        file_path = self.get_credentials_filepath()
+
+        with open(file_path, encoding='utf-8') as f:
+            return json.load(f)
+
+    def format_query(self, num_rows):
+        """
+        Formats query string using the module level query constants and offset/row information.
+
+        :param num_rows: No of rows to Fetch from Postgres Database
+        :returns query_string: PostgreSQL query string
+        """
+        # ORDER BY gives the rows a stable order, without it OFFSET based paging is free to return
+        # overlapping or missing rows between calls.
+        # NOTE(review): assumes the search column is (close to) unique per row - confirm against the schema.
+        # `int()` keeps anything but numbers out of the query string.
+        query_string = "SELECT {columns_return} FROM {database} WHERE {columns_search} like '{tool_name}' " \
+                       "ORDER BY {columns_search} " \
+                       "OFFSET {offset} ROWS FETCH FIRST {num_rows} ROW ONLY;".format(
+                        columns_return=COLUMNS_RETURN,
+                        database=DATABASE_NAME,
+                        columns_search=COLUMNS_SEARCH,
+                        tool_name=TOOL_NAME,
+                        offset=int(self.offset),
+                        num_rows=int(num_rows)
+                        )
+
+        return query_string
+
+    def init_connection(self):
+        """
+        Initiate Connection, called with Class Constructor.
+
+        :returns cursor: psycopg2.cursor Object
+        :returns connection: psycopg2.connection Object
+        """
+        # Fetch credentials from data/credentials.json
+        credentials = self.import_database_credentials()
+
+        # Initialize Connection Object
+        connection = psycopg2.connect(user=credentials['user'],
+                                      password=credentials['password'],
+                                      host=credentials['host'],
+                                      port=credentials['port'],
+                                      database=credentials['database'])
+
+        # Connect to Database
+        cursor = connection.cursor()
+
+        return cursor, connection
+
+    def fetch_data(self, num_rows_to_fetch):
+        """
+        Fetches `path` and `content` from the Database, into list of (path, content) Tuples, starting after
+        the rows returned by earlier calls. Example path:
+        'composer/packagist/phpro/grumphp/revision/0.16.1/tool/scancode/3.2.2.json'
+
+        :param num_rows_to_fetch: int
+            Number of rows to fetch, i.e. each row is a package scan
+        :returns records: list of tuples
+        """
+        # Fetch contents of rows where path has "/scancode/"
+        query_string = self.format_query(num_rows_to_fetch)
+        self.cursor.execute(query_string)
+
+        # Load all the data into record
+        records = self.cursor.fetchall()
+
+        # Move the offset only after the fetch succeeded, and only by the rows actually received, so
+        # that a failed query does not make the next call skip rows.
+        self.offset += len(records)
+
+        return records
+
+    def close_connection(self):
+        """
+        Closes Postgres connection. Calling it again afterwards does nothing.
+        """
+        if self.connection:
+            self.cursor.close()
+            self.connection.close()
+            # Forget the closed handles, so they are not used or closed a second time
+            self.cursor = None
+            self.connection = None
+            # print("PostgreSQL connection is closed")