Skip to content

Commit

Permalink
Don't leak IP address (#310)
Browse files Browse the repository at this point in the history
* Revert "Remove urllib3 dependency"

This reverts commit 0911477.

urllib3 is needed to parse and post-process URLs for sanitization and privacy purposes (#192)

* IA currently leaks the IP address of the submitter. This is bad.

We fix this by carefully redacting the IP address in the JSON fields known to contain it.

* added tests

* flake8 linting
  • Loading branch information
drzraf committed Sep 18, 2023
1 parent 2655580 commit 9c2ae72
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 2 deletions.
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
},
install_requires=[
'internetarchive',
'urllib3==1.26.13',
'docopt==0.6.2',
'yt-dlp',
]
Expand Down
20 changes: 19 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
import unittest
import os
from tubeup.utils import sanitize_identifier, check_is_file_empty
import json
from tubeup.utils import sanitize_identifier, check_is_file_empty, strip_ip_from_meta

current_path = os.path.dirname(os.path.realpath(__file__))


def get_testfile_path(name):
    """
    Build the path of a test fixture.

    Joins `current_path`, the 'test_tubeup_files' directory name and
    `name`, and returns the resulting path string.
    """
    fixtures_dir = os.path.join(current_path, 'test_tubeup_files')
    return os.path.join(fixtures_dir, name)


class UtilsTest(unittest.TestCase):
Expand Down Expand Up @@ -48,3 +55,14 @@ def test_check_is_file_empty_when_file_doesnt_exist(self):
FileNotFoundError,
r"^Path 'file_that_doesnt_exist.txt' doesn't exist$"):
check_is_file_empty('file_that_doesnt_exist.txt')

def test_strip_ip_from_meta(self):
    """
    strip_ip_from_meta() must report a modification and remove the IP
    address present in the fixture's info.json.
    """
    fixture_path = get_testfile_path(
        'Mountain_3_-_Video_Background_HD_1080p-6iRV8liah8A.'
        'info.json')
    # Read the raw text once: calling f.read() after json.load(f) would
    # return an empty string and make the comparison below meaningless.
    with open(fixture_path, encoding='utf-8') as f:
        original_text = f.read()

    # strip_ip_from_meta() edits its argument in place, so keep an
    # independently parsed copy of the original to compare against.
    original_meta = json.loads(original_text)
    mod, new_meta = strip_ip_from_meta(json.loads(original_text))

    self.assertTrue(mod)
    self.assertNotEqual(original_meta, new_meta)
    self.assertNotRegex(json.dumps(new_meta), r'36\.73\.93\.234')
7 changes: 6 additions & 1 deletion tubeup/TubeUp.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from internetarchive.config import parse_config_file
from datetime import datetime
from yt_dlp import YoutubeDL
from .utils import (get_itemname, check_is_file_empty,
from .utils import (get_itemname, check_is_file_empty, strip_ip_from_meta,
EMPTY_ANNOTATION_FILE)
from logging import getLogger
from urllib.parse import urlparse
Expand Down Expand Up @@ -324,6 +324,11 @@ def upload_ia(self, videobasename, custom_meta=None):
with open(json_metadata_filepath, 'r', encoding='utf-8') as f:
vid_meta = json.load(f)

mod, new_meta = strip_ip_from_meta(vid_meta)
if mod:
with open(json_metadata_filepath, 'w') as f:
json.dump(new_meta, f)

# Exit if video download did not complete, don't upload .part files to IA
for ext in ['*.part', '*.f303.*', '*.f302.*', '*.ytdl', '*.f251.*', '*.248.*', '*.f247.*', '*.temp']:
if glob.glob(videobasename + ext):
Expand Down
37 changes: 37 additions & 0 deletions tubeup/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import re
from urllib.parse import urlparse, parse_qs, urlencode


EMPTY_ANNOTATION_FILE = ('<?xml version="1.0" encoding="UTF-8" ?>'
Expand Down Expand Up @@ -29,3 +30,39 @@ def check_is_file_empty(filepath):
return os.stat(filepath).st_size == 0
else:
raise FileNotFoundError("Path '%s' doesn't exist" % filepath)


def strip_ip_from_url(url):
    """
    Strip occurrences of an IP address from a URL.

    Two locations are handled:

    - a path segment following ``/ip/`` (like in ``/ip/1.2.3.4/``): the
      segment is replaced by ``REDACTED``;
    - an ``ip`` query parameter (like in ``?ip=1.2.3.4``): the parameter
      is removed from the query string.

    The query string is only re-encoded when an ``ip`` parameter was
    actually present; otherwise it is returned untouched.

    :param url: the URL to sanitize
    :return: the sanitized URL as a string
    """
    u = urlparse(url)
    u = u._replace(path=re.sub(r'/ip/[^/]+', r'/ip/REDACTED', u.path))
    if u.query != '':
        # keep_blank_values=True: without it parse_qs() drops every
        # parameter whose value is empty (e.g. "sig="), and those would
        # silently vanish from the rewritten query string.
        qs = parse_qs(u.query, keep_blank_values=True)
        if 'ip' in qs:
            del qs['ip']
            # doseq=True because parse_qs() maps each key to a list.
            u = u._replace(query=urlencode(qs, True))
    return u.geturl()


def strip_ip_from_meta(meta):
    """
    Redact IP addresses from the URL fields of a metadata dict.

    The top-level ``url`` field and, for every entry of ``formats``, the
    ``manifest_url``, ``fragment_base_url`` and ``url`` fields are passed
    through strip_ip_from_url(). Missing fields, a missing or empty
    ``formats`` list and non-string values are skipped.

    The dict is edited in place; the same object is returned.

    :param meta: metadata dict (as loaded from an info.json file)
    :return: a ``(modified, meta)`` tuple, where ``modified`` is True
             when at least one field was changed
    """
    modified = False

    url = meta.get('url')
    if isinstance(url, str):
        redacted_url = strip_ip_from_url(url)
        if redacted_url != url:
            meta['url'] = redacted_url
            modified = True

    # 'formats' may be absent or None; treat both as "nothing to redact"
    # instead of raising, like the top-level 'url' field.
    for _format in meta.get('formats') or []:
        for field in ('manifest_url', 'fragment_base_url', 'url'):
            value = _format.get(field)
            if not isinstance(value, str):
                continue
            redacted_url = strip_ip_from_url(value)
            if redacted_url != value:
                _format[field] = redacted_url
                modified = True

    return modified, meta

0 comments on commit 9c2ae72

Please sign in to comment.