Global Database of Events, Language, and Tone
High level overview of the GDELT Project -
GDELT 2.0 GKG Dataset -
GDELT 2.0 GKG Documentation - Essential for understanding GDELT GKG 2.0 records -
GCAM - Global Content Analysis Measures (a GKG Column)
Master GKG file list containing all GKG file URLs going back to 2015 -
Latest GKG file list containing file URLs updated every 15 minutes
|-- GkgRecordId: struct (nullable = true)
| |-- Date: long (nullable = true)
| |-- Translingual: boolean (nullable = true)
| |-- NumberInBatch: integer (nullable = true)
|-- V21Date: struct (nullable = true)
| |-- V21Date: timestamp (nullable = true)
|-- V2SrcCollectionId: struct (nullable = true)
| |-- V2SrcCollectionId: string (nullable = true)
|-- V2SrcCmnName: struct (nullable = true)
| |-- V2SrcCmnName: string (nullable = true)
|-- V2DocId: struct (nullable = true)
| |-- V2DocId: string (nullable = true)
|-- V1Counts: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- CountType: string (nullable = true)
| | |-- Count: long (nullable = true)
| | |-- ObjectType: string (nullable = true)
| | |-- LocationType: integer (nullable = true)
| | |-- FullName: string (nullable = true)
| | |-- CountryCode: string (nullable = true)
| | |-- ADM1Code: string (nullable = true)
| | |-- LocationLatitude: decimal(9,7) (nullable = true)
| | |-- LocationLongitude: decimal(10,7) (nullable = true)
| | |-- FeatureId: string (nullable = true)
|-- V21Counts: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- CountType: string (nullable = true)
| | |-- Count: long (nullable = true)
| | |-- ObjectType: string (nullable = true)
| | |-- LocationType: integer (nullable = true)
| | |-- FullName: string (nullable = true)
| | |-- CountryCode: string (nullable = true)
| | |-- ADM1Code: string (nullable = true)
| | |-- LocationLatitude: decimal(9,7) (nullable = true)
| | |-- LocationLongitude: decimal(10,7) (nullable = true)
| | |-- FeatureId: string (nullable = true)
| | |-- CharOffset: integer (nullable = true)
|-- V1Themes: struct (nullable = true)
| |-- V1Theme: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- V2EnhancedThemes: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- V2Theme: string (nullable = true)
| | |-- CharOffset: integer (nullable = true)
|-- V1Locations: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- LocationType: integer (nullable = true)
| | |-- FullName: string (nullable = true)
| | |-- CountryCode: string (nullable = true)
| | |-- ADM1Code: string (nullable = true)
| | |-- LocationLatitude: decimal(9,7) (nullable = true)
| | |-- LocationLongitude: decimal(10,7) (nullable = true)
| | |-- FeatureId: string (nullable = true)
|-- V2Locations: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- LocationType: integer (nullable = true)
| | |-- FullName: string (nullable = true)
| | |-- CountryCode: string (nullable = true)
| | |-- ADM1Code: string (nullable = true)
| | |-- ADM2Code: string (nullable = true)
| | |-- LocationLatitude: decimal(9,7) (nullable = true)
| | |-- LocationLongitude: decimal(10,7) (nullable = true)
| | |-- FeatureId: string (nullable = true)
| | |-- CharOffset: integer (nullable = true)
|-- V1Persons: struct (nullable = true)
| |-- V1Person: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- V2Persons: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- V1Person: string (nullable = true)
| | |-- CharOffset: integer (nullable = true)
|-- V1Orgs: struct (nullable = true)
| |-- V1Org: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- V2Orgs: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- V1Org: string (nullable = true)
| | |-- CharOffset: integer (nullable = true)
|-- V15Tone: struct (nullable = true)
| |-- Tone: double (nullable = true)
| |-- PositiveScore: double (nullable = true)
| |-- NegativeScore: double (nullable = true)
| |-- Polarity: double (nullable = true)
| |-- ActivityRefDensity: double (nullable = true)
| |-- SelfGroupRefDensity: double (nullable = true)
| |-- WordCount: integer (nullable = true)
|-- V21EnhancedDates: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- DateResolution: integer (nullable = true)
| | |-- Month: integer (nullable = true)
| | |-- Day: integer (nullable = true)
| | |-- Year: integer (nullable = true)
| | |-- CharOffset: integer (nullable = true)
|-- V2GCAM: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- DictionaryDimId: string (nullable = true)
| | |-- Score: double (nullable = true)
|-- V21ShareImg: struct (nullable = true)
| |-- V21ShareImg: string (nullable = true)
|-- V21RelImg: struct (nullable = true)
| |-- V21RelImg: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- V21SocImage: struct (nullable = true)
| |-- V21SocImage: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- V21SocVideo: struct (nullable = true)
| |-- V21SocVideo: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- V21Quotations: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- Offset: integer (nullable = true)
| | |-- CharLength: integer (nullable = true)
| | |-- Verb: string (nullable = true)
| | |-- Quote: string (nullable = true)
|-- V21AllNames: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- Name: string (nullable = true)
| | |-- CharOffset: integer (nullable = true)
|-- V21Amounts: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- Amount: double (nullable = true)
| | |-- Object: string (nullable = true)
| | |-- Offset: integer (nullable = true)
|-- V21TransInfo: struct (nullable = true)
| |-- Srclc: string (nullable = true)
| |-- Eng: string (nullable = true)
|-- V2ExtrasXML: struct (nullable = true)
| |-- Title: string (nullable = true)
| |-- Author: string (nullable = true)
| |-- Links: string (nullable = true)
| |-- AltUrl: string (nullable = true)
| |-- AltUrlAmp: string (nullable = true)
| |-- PubTimestamp: timestamp (nullable = true)
conda create --name dbconnect38 python=3.8
conda activate dbconnect38
pip install -U databricks-connect
Databricks Connect also requires you to have Java 8 installed. If you have multiple Java versions, you can manage them via your
profile. -
If you're on OS X and use Homebrew you can brew install Java 8
Brew Install Java Tutorial
brew cask install adoptopenjdk8
- make sure $JAVA_HOME and $JRE_HOME are pointing towards the Java 8 version.
export JAVA_HOME=/usr/local/Cellar/openjdk@8/1.8.0+282
export PATH=$JAVA_HOME/bin:$PATH
export JRE_HOME=$JAVA_HOME/jre
export PATH=$JAVA_HOME/bin:$JRE_HOME/bin:$PATH
Follow the official Databricks Connect Documentation to get going, but first set up a conda env and install Java 8
- Databricks Connect allows you to connect your IDE to a Databricks cluster and run your code from your local environment (rather than through a Notebook). However, you will still need to connect to and mount your storage using a Notebook, after which code can be executed from your IDE.
- As of the time of writing, the latest supported Databricks Runtime versions are 9.1 LTS ML and 9.1 LTS, and they require Python 3.8 to run. If you already have Python and Spark installed, you can simply create a new conda or venv environment built on Python 3.8, activate it, and then pip install Databricks Connect.
Databricks Runtime version | Python version |
9.1 LTS ML, 9.1 LTS | 3.8 |
7.3 LTS ML, 7.3 LTS | 3.7 |
6.4 ML, 6.4 | 3.7 |
The minor version of your client Python installation must be the same as the minor Python version of your Databricks cluster. The table shows the Python version installed with each Databricks Runtime.
- If you've previously set a SPARK_HOME
environment variable, you will need to comment it out and create a new one that points to the PySpark installation contained within the Databricks Connect environment you just created.
export SPARK_HOME=/Users/<user_name>/opt/anaconda3/envs/dbconnect38/lib/python3.8/site-packages/pyspark
Once you have the environment set you can continue to configure Databricks Connect by running the configuration command as explained in the Databricks Connect Documentation and supplying your cluster information and personal access token.
The credentials you enter will be saved to a hidden file called .databricks-connect
, located in your home folder.
If needed, you can access and edit the file by opening a new Finder window and navigating to the file directly:
Shift+Cmd+G and enter ~/.databricks-connect
Alternatively, you can simply navigate to your home folder and press Shift+Cmd+. to see all hidden files.
Using Azure Storage Explorer you can create the needed folders for running the pipeline.
You will manually add the config.toml
file to the config directory as well as the cluster init
script to the scripts directory.
The final directory structure including needed files should appear as follows (with .
as the root of your blob container):
├── config
│  └── config.toml
├── gdelt
│  ├── download_metrics
│  ├── pipeline_metrics_final
│  ├── pipeline_metrics_temp
│  ├── raw_gkg
│  └── transformed_gkg
└── scripts
Microsoft ADLS Gen2 Service Principal Documentation
Note that you must set permissions to allow your service principal app to access ADLS Gen2 Storage
- Check the 'default' box to propagate access down to all folders and files contained within (before they are added to the directory)
The default ACL determines permissions for new children of this directory. Changing the default ACL does not affect children that already exist.
- If you're encountering 403 or 'Invalid CSRF Token' errors, this blog post is helpful in further describing the steps to authenticate as a service principal and set permissions on folders using Azure Storage Explorer.
Avoid 403 Errors Accessing ADLS Gen2 As Service Principal
Access Azure Data Lake Storage Gen2 using OAuth 2.0 with an Azure service principal
Mounting & Accessing ADLS Gen2 in Azure Databricks Using Service Principal and Secret Scopes
Secret Scopes
Azure Databricks Secret Scopes Documentation
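# within Databricks Notebook
configs = {"fs.azure.account.auth.type": "OAuth",
"fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
"fs.azure.account.oauth2.client.id": "<application_ID>",
"fs.azure.account.oauth2.client.secret": dbutils.secrets.get(scope="gdelt-pipeline-secret-scope", key="gdelt-databricks-SECRET"),
"fs.azure.account.oauth2.client.endpoint": "https://login.microsoftonline.com/<tenant_ID>/oauth2/token",
"fs.azure.createRemoteFileSystemDuringInitialization": "true"}
dbutils.fs.mount(
source = "abfss://<container_name>@<storage_acc_name>.dfs.core.windows.net/",
mount_point = "/mnt/",
extra_configs = configs)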
To unmount your storage account:
# within Databricks Notebook
dbutils.fs.unmount("/mnt/")
- Under the Advanced section of your cluster, edit settings to supply additional Spark and Hadoop configs:
spark.sql.session.timeZone Hongkong
spark.databricks.passthrough.enabled true true
spark.databricks.service.server.enabled true
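spark.databricks.pyspark.enableProcessIsolation false
fs.azure.account.oauth2.client.endpoint.<storage_acc_name>.dfs.core.windows.net https://login.microsoftonline.com/<tenant_ID>/oauth2/token
fs.azure.account.oauth2.client.id.<storage_acc_name>.dfs.core.windows.net <application_ID>
fs.azure.account.key.<storage_acc_name>.dfs.core.windows.net <AZURE_STORAGE_ACCESS_KEY>
fs.azure.account.auth.type.<storage_acc_name>.dfs.core.windows.net OAuth
fs.azure.account.oauth.provider.type.<storage_acc_name>.dfs.core.windows.net org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider
fs.azure.account.oauth2.client.secret.<storage_acc_name>.dfs.core.windows.net <app_client_secret>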
- Note that the
environment variables as seen in the picture are not currently needed and can be disregarded.
- Supply the path to the init script
In Azure Storage Explorer, navigate to the scripts folder located at the root of the container and add the init
script. This will install all of the required Python packages on all nodes within your cluster.
DBFS file path:
Two libraries are needed to run the pipeline. Both are contained within the gdelt-gkg/dist
folder of the repository.
The first is the gdelt-gkg app containing all of the python modules: gdelt_gkg-1.0.0-py3-none-any.whl
The second is the cloudpathlib library. PyPI pip installs of cloudpathlib were not working, so I downloaded the source code and ran a build. I have included the .whl file cloudpathlib-0.7.0-py3-none-any.whl
within the gdelt-gkg/dist folder.
['GkgRecordId', 'V21Date', 'V2SrcCollectionId', 'V2SrcCmnName', 'V2DocId', 'V1Counts', 'V21Counts', 'V1Themes', 'V2EnhancedThemes', 'V1Locations', 'V2Locations', 'V1Persons', 'V2Persons', 'V1Orgs', 'V2Orgs', 'V15Tone', 'V21EnhancedDates', 'V2GCAM', 'V21ShareImg', 'V21RelImg', 'V21SocImage', 'V21SocVideo', 'V21Quotations', 'V21AllNames', 'V21Amounts', 'V21TransInfo', 'V2ExtrasXML']
Highlighting the denormalized structure of the data. Before transformation each CSV file contains 27 columns, many of which contain further nested data elements which are delimited differently depending on the column.
Expand to view raw. Scroll right to view further...
Expand to view raw. Scroll right to view further...
KILL#13##1#United States#US#US#39.828175#-98.5795#US;CRISISLEX_T03_DEAD#13##1#United States#US#US#39.828175#-98.5795#US;CRISISLEX_T03_DEAD#13##1#United States#US#US#39.828175#-98.5795#US;
KILL#13##1#United States#US#US#39.828175#-98.5795#US#1402;CRISISLEX_T03_DEAD#13##1#United States#US#US#39.828175#-98.5795#US#1402;CRISISLEX_T03_DEAD#13##1#United States#US#US#39.828175#-98.5795#US#1402;
arielle sheftall;sherrod brown;brown d-ohio
Arielle Sheftall,1587;Sherrod Brown,74
american academy of pediatrics;american hospital association;association of american medical colleges;academy of child
American Academy Of Pediatrics,953;American Hospital Association,989;Association Of American Medical Colleges,1036;Academy Of Child,2599
Sherrod Brown,78;Child Suicide Prevention,132;Lethal Means Safety,156;Democratic Congresswoman Lauren Underwood,514;Child Suicide Prevention,563;Lethal Means Safety,587;American Academy,992;American Hospital Association,1045;American Medical,1085;Arielle Sheftall,1661;Suicide Prevention,1726;Abigail Wexner Research Institute,1780;Nationwide Children,1803;Adolescent Psychology,2770
4,leading cause of death,1602;
<PAGE_AUTHORS>Lucas Bechtol;ltbechtol</PAGE_AUTHORS><PAGE_PRECISEPUBTIMESTAMP>20211020231500</PAGE_PRECISEPUBTIMESTAMP><PAGE_TITLE>Brown supporting child suicide prevention bill</PAGE_TITLE>