Skip to content

Commit

Permalink
Normalize column names
Browse files Browse the repository at this point in the history
This commit normalizes the column names so that they are lowercased and
have underscores instead of dashes. Hopefully it's not disruptive for
existing uses of warcdb!
  • Loading branch information
edsu committed Oct 20, 2023
1 parent af8d544 commit bfde7cd
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 72 deletions.
13 changes: 13 additions & 0 deletions tests/test_warcdb.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from click.testing import CliRunner
from warcdb import warcdb_cli
import os
import re
import pathlib
import pytest
import sqlite_utils
Expand Down Expand Up @@ -40,3 +41,15 @@ def test_import(warc_path):
assert db.table('request').get('<urn:uuid:524F62DD-D788-4085-B14D-22B0CDC0AC53>')

os.remove(db_file)


def test_column_names():
    """Verify that every column created by ``init`` + ``import`` is normalized.

    Runs the ``init`` and ``import`` CLI commands against ``db_file`` (a name
    defined elsewhere in this test module), then inspects every column of
    every table and requires the whole name to consist of lowercase letters,
    digits and underscores.
    """
    runner = CliRunner()
    try:
        # Fail loudly if a CLI step breaks; otherwise the column loop below
        # could pass without the import having produced the expected columns.
        init_result = runner.invoke(warcdb_cli, ['init', db_file])
        assert init_result.exit_code == 0, init_result.output

        import_result = runner.invoke(
            warcdb_cli,
            ["import", db_file, str(pathlib.Path('tests/google.warc'))],
        )
        assert import_result.exit_code == 0, import_result.output

        # make sure that the columns are named correctly (lowercase with underscores)
        db = sqlite_utils.Database(db_file)
        for table in db.tables:
            for col in table.columns:
                # fullmatch, not match: an unanchored prefix match would accept
                # names such as "warc-record-id" because "warc" alone matches.
                assert re.fullmatch(r'[a-z_][a-z0-9_]*', col.name), (
                    f'column {col.name!r} in table {table.name!r} '
                    'is not lowercase with underscores'
                )
    finally:
        # Remove the database even when an assertion fails, so a leftover file
        # cannot affect other tests that use the same path.
        if os.path.exists(db_file):
            os.remove(db_file)
33 changes: 16 additions & 17 deletions warcdb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,7 @@ def record_payload(self: ArcWarcRecord):
@cache
def record_as_dict(self: ArcWarcRecord):
"""Method to easily represent a record as a dict, to be fed into db_utils.Database.insert()"""

return dict(self.rec_headers.headers)
return {k.lower().replace('-', '_'): v for k, v in self.rec_headers.headers}


setattr(ArcWarcRecord, 'as_dict', record_as_dict)
Expand Down Expand Up @@ -135,14 +134,14 @@ def __iadd__(self, r: ArcWarcRecord):
====
* For all rec_types: also store WARC/1.0 field (warc and version?)
* Todo pass conversions: {'Content-Length': int, WARC-Date: datet
* Todo pass conversions: {'content_length': int, 'warc_date': datetime.datetime}
* All 'response', 'resource', 'request', 'revisit', 'conversion' and 'continuation' records may have a payload.
All 'warcinfo' and 'metadata' records shall not have a payload.
"""
col_type_conversions = {
'Content-Length': int,
'content_length': int,
'payload': str,
'WARC-Date': datetime.datetime,
'warc_date': datetime.datetime,

}
record_dict = r.as_dict()
Expand All @@ -161,15 +160,15 @@ def __iadd__(self, r: ArcWarcRecord):
if r.rec_type == 'warcinfo':

self.db.table('warcinfo').insert(record_dict,
pk='WARC-Record-ID',
pk='warc_record_id',
alter=True,
ignore=True,
columns=col_type_conversions)
elif r.rec_type == 'request':
self.db.table('request').insert(record_dict,
pk='WARC-Record-ID',
pk='warc_record_id',
foreign_keys=[
("WARC-Warcinfo-ID", "warcinfo", "WARC-Record-ID")
("warc_warcinfo_id", "warcinfo", "warc_record_id")
],
alter=True,
ignore=True,
Expand All @@ -178,10 +177,10 @@ def __iadd__(self, r: ArcWarcRecord):

elif r.rec_type == 'response':
self.db.table('response').insert(record_dict,
pk='WARC-Record-ID',
pk='warc_record_id',
foreign_keys=[
("WARC-Warcinfo-ID", "warcinfo", "WARC-Record-ID"),
("WARC-Concurrent-To", "request", "WARC-Record-ID")
("warc_warcinfo_id", "warcinfo", "warc_record_id"),
("warc_concurrent_to", "request", "warc_record_id")
],
alter=True,
ignore=True,
Expand All @@ -190,10 +189,10 @@ def __iadd__(self, r: ArcWarcRecord):

elif r.rec_type == 'metadata':
self.db.table('metadata').insert(record_dict,
pk='WARC-Record-ID',
pk='warc_record_id',
foreign_keys=[
("WARC-Warcinfo-ID", "warcinfo", "WARC-Record-ID"),
("WARC-Concurrent-To", "response", "WARC-Record-ID")
("warc_warcinfo_id", "warcinfo", "warc_record_id"),
("warc_concurrent_to", "response", "warc_record_id")
],
alter=True,
ignore=True,
Expand All @@ -202,10 +201,10 @@ def __iadd__(self, r: ArcWarcRecord):

elif r.rec_type == 'resource':
self.db.table('resource').insert(record_dict,
pk='WARC-Record-ID',
pk='warc_record_id',
foreign_keys=[
("WARC-Warcinfo-ID", "warcinfo", "WARC-Record-ID"),
("WARC-Concurrent-To", "metadata", "WARC-Record-ID")
("warc_warcinfo_id", "warcinfo", "warc_record_id"),
("warc_concurrent_to", "metadata", "warc_record_id")
],
alter=True,
ignore=True,
Expand Down
110 changes: 55 additions & 55 deletions warcdb/migrations.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,91 +7,91 @@
def m001_initial(db):
db["warcinfo"].create(
{
"WARC-Type": str,
"Content-Type": str,
"WARC-Date": str,
"WARC-Record-ID": str,
"WARC-Filename": str,
"WARC-Block-Digest": str,
"Content-Length": int,
"warc_type": str,
"content_type": str,
"warc_date": str,
"warc_record_id": str,
"warc_filename": str,
"warc_block_digest": str,
"content_length": int,
"payload": str,
},
pk="WARC-Record-ID",
pk="warc_record_id",
)

db["request"].create(
{
"WARC-Type": str,
"WARC-Target-URI": str,
"Content-Type": str,
"WARC-Date": str,
"WARC-Record-ID": str,
"WARC-IP-Address": str,
"WARC-Warcinfo-ID": str,
"WARC-Block-Digest": str,
"Content-Length": int,
"warc_type": str,
"warc_target_uri": str,
"content_type": str,
"warc_date": str,
"warc_record_id": str,
"warc_ip_address": str,
"warc_warcinfo_id": str,
"warc_block_digest": str,
"content_length": int,
"payload": str,
"http_headers": str,
},
pk="WARC-Record-ID",
foreign_keys=[("WARC-Warcinfo-ID", "warcinfo", "WARC-Record-ID")],
pk="warc_record_id",
foreign_keys=[("warc_warcinfo_id", "warcinfo", "warc_record_id")],
)

db["response"].create(
{
"WARC-Type": str,
"WARC-Record-ID": str,
"WARC-Warcinfo-ID": str,
"WARC-Concurrent-To": str,
"WARC-Target-URI": str,
"WARC-Date": str,
"WARC-IP-Address": str,
"WARC-Block-Digest": str,
"WARC-Payload-Digest": str,
"Content-Type": str,
"Content-Length": int,
"warc_type": str,
"warc_record_id": str,
"warc_warcinfo_id": str,
"warc_concurrent_to": str,
"warc_target_uri": str,
"warc_date": str,
"warc_ip_address": str,
"warc_block_digest": str,
"warc_payload_digest": str,
"content_type": str,
"content_length": int,
"payload": str,
"http_headers": str,
},
pk="WARC-Record-ID",
pk="warc_record_id",
foreign_keys=[
("WARC-Warcinfo-ID", "warcinfo", "WARC-Record-ID"),
("WARC-Concurrent-To", "request", "WARC-Record-ID"),
("warc_warcinfo_id", "warcinfo", "warc_record_id"),
("warc_concurrent_to", "request", "warc_record_id"),
],
)

db["metadata"].create(
{
"WARC-Type": str,
"WARC-Record-ID": str,
"WARC-Warcinfo-ID": str,
"WARC-Target-URI": str,
"WARC-Date": str,
"WARC-Block-Digest": str,
"Content-Type": str,
"Content-Length": int,
"warc_type": str,
"warc_record_id": str,
"warc_warcinfo_id": str,
"warc_target_uri": str,
"warc_date": str,
"warc_block_digest": str,
"content_type": str,
"content_length": int,
"payload": str,
},
pk="WARC-Record-ID",
foreign_keys=[("WARC-Warcinfo-ID", "warcinfo", "WARC-Record-ID")],
pk="warc_record_id",
foreign_keys=[("warc_warcinfo_id", "warcinfo", "warc_record_id")],
)

db["resource"].create(
{
"WARC-Type": str,
"WARC-Record-ID": str,
"WARC-Warcinfo-ID": str,
"WARC-Concurrent-To": str,
"WARC-Target-URI": str,
"WARC-Date": str,
"WARC-Block-Digest": str,
"Content-Type": str,
"Content-Length": int,
"warc_type": str,
"warc_record_id": str,
"warc_warcinfo_id": str,
"warc_concurrent_to": str,
"warc_target_uri": str,
"warc_date": str,
"warc_block_digest": str,
"content_type": str,
"content_length": int,
"payload": str,
},
pk="WARC-Record-ID",
pk="warc_record_id",
foreign_keys=[
("WARC-Warcinfo-ID", "warcinfo", "WARC-Record-ID"),
("WARC-Concurrent-To", "metadata", "WARC-Record-ID"),
("warc_warcinfo_id", "warcinfo", "warc_record_id"),
("warc_concurrent_to", "metadata", "warc_record_id"),
],
)

0 comments on commit bfde7cd

Please sign in to comment.