-
Notifications
You must be signed in to change notification settings - Fork 25
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Commoncrawl pipeline] Add component extract free-to-use images (#282)
This 3rd component extracts the image url, alt text and license metadata from the webpage url and html code.
- Loading branch information
1 parent
be7bd0b
commit b8bfaef
Showing
7 changed files
with
303 additions
and
0 deletions.
There are no files selected for viewing
18 changes: 18 additions & 0 deletions
18
examples/pipelines/commoncrawl/components/extract_image_licenses/Dockerfile
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
FROM --platform=linux/amd64 python:3.8-slim | ||
|
||
## System dependencies | ||
RUN apt-get update && \ | ||
apt-get upgrade -y && \ | ||
apt-get install git -y | ||
|
||
# install requirements | ||
COPY requirements.txt / | ||
RUN pip3 install --no-cache-dir -r requirements.txt | ||
|
||
# Set the working directory to the component folder | ||
WORKDIR /component/src | ||
|
||
# Copy over src-files | ||
COPY src/ . | ||
|
||
ENTRYPOINT ["python", "main.py"] |
8 changes: 8 additions & 0 deletions
8
examples/pipelines/commoncrawl/components/extract_image_licenses/README.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# extract_image_licenses | ||
|
||
### Description | ||
This component extracts image URLs and license metadata from a dataframe of webpage URLs and HTML code. | ||
|
||
### **Inputs/Outputs** | ||
|
||
See [`fondant_component.yaml`](fondant_component.yaml) for a more detailed description of all the input/output parameters. |
25 changes: 25 additions & 0 deletions
25
examples/pipelines/commoncrawl/components/extract_image_licenses/fondant_component.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
name: Extract image url and license from commoncrawl | ||
description: Component that extracts image url and license metadata from a dataframe of webpage urls and html codes | ||
image: ghcr.io/ml6team/extract_image_licenses:latest | ||
|
||
consumes: | ||
webpage: | ||
fields: | ||
url: | ||
type: string | ||
html: | ||
type: string | ||
|
||
produces: | ||
image: | ||
fields: | ||
image_url: | ||
type: string | ||
alt_text: | ||
type: string | ||
webpage_url: | ||
type: string | ||
license_type: | ||
type: string | ||
license_location: | ||
type: string |
2 changes: 2 additions & 0 deletions
2
examples/pipelines/commoncrawl/components/extract_image_licenses/requirements.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
beautifulsoup4==4.12.2 | ||
git+https://github.com/ml6team/fondant@main |
88 changes: 88 additions & 0 deletions
88
examples/pipelines/commoncrawl/components/extract_image_licenses/src/main.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
import re | ||
import logging | ||
|
||
import pandas as pd | ||
from bs4 import BeautifulSoup | ||
from typing import List | ||
|
||
from fondant.component import PandasTransformComponent | ||
|
||
from utils.license_utils import get_license_type, get_license_location | ||
from utils.image_utils import get_images_from_soup, get_unique_images | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
def get_image_info_from_webpage(webpage_url: str, webpage_html: str) -> List[List[str]]:
    """Collects the images of a webpage together with its license metadata.

    The anchors of the page are scanned in document order. The first anchor
    that yields both a license type and a license location decides the
    license attached to every image found on the page.

    Args:
        webpage_url: The url of the webpage.
        webpage_html: The html content of the webpage.
    Returns:
        A de-duplicated list of image records as built by
        `get_images_from_soup`, or None when no usable license link is found
        or when processing the page raises.
    """
    try:
        parsed_page = BeautifulSoup(webpage_html, "html.parser")

        for anchor in parsed_page.find_all("a"):
            # Only anchors that actually link somewhere can carry a license.
            if not anchor.has_attr("href"):
                continue

            license_type = get_license_type(anchor)
            if license_type is None:
                continue

            license_location = get_license_location(anchor)
            if license_location is None:
                continue

            logger.info(
                f"Found license type: {license_type} at {license_location}"
            )

            images = get_images_from_soup(
                parsed_page, webpage_url, license_type, license_location
            )
            logger.info(f"Found {len(images)} images.")

            unique_images = get_unique_images(images)
            logger.info(f"Found {len(unique_images)} unique images.")

            # The first usable license link wins; later anchors are ignored.
            return unique_images

    except Exception as e:
        logger.error(f"Error parsing HTML: {e}")
        return None

    # No anchor produced both a license type and a license location.
    return None
|
||
|
||
class ExtractImageLicenses(PandasTransformComponent):
    """Component that turns webpage rows into image/license rows."""

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extracts image url and license metadata from the HTML content.

        Args:
            df: A pandas dataframe providing the ("webpage", "url") and
                ("webpage", "html") columns.
        Returns:
            A pandas dataframe with the ("image", ...) columns image_url,
            alt_text, webpage_url, license_type and license_location.
        """

        def extract_row(row):
            # One call per webpage row; the result is a list of image records
            # or None (see get_image_info_from_webpage).
            return get_image_info_from_webpage(
                row[("webpage", "url")], row[("webpage", "html")]
            )

        per_page = df.apply(extract_row, axis=1, result_type="expand")

        # NOTE(review): this relies on how `result_type="expand"` lays out the
        # per-page lists before column 0 is exploded - confirm against the
        # pandas version in use.
        per_image = per_page.explode(0).apply(pd.Series)

        # Rows with missing values (e.g. pages without results) are discarded.
        df = per_image.dropna()

        df.columns = [
            ("image", "image_url"),
            ("image", "alt_text"),
            ("image", "webpage_url"),
            ("image", "license_type"),
            ("image", "license_location"),
        ]

        return df
|
||
|
||
if __name__ == "__main__":
    # Build the component through its `from_args` constructor and run it.
    ExtractImageLicenses.from_args().run()
83 changes: 83 additions & 0 deletions
83
examples/pipelines/commoncrawl/components/extract_image_licenses/src/utils/image_utils.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
import logging | ||
from urllib.parse import urlparse | ||
from typing import Any, List | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
def get_full_image_url(image_url: str, webpage_url: str) -> str:
    """Returns the absolute image url without its query string.

    Args:
        image_url: The image url as found in the html; it may be absolute,
            root-relative ("/a.png"), path-relative ("img/a.png") or
            protocol-relative ("//host/a.png").
        webpage_url: The url of the webpage the image was found on.
    Returns:
        The full image url, with everything from the first "?" on removed.
    """
    # Imported locally because the module header only imports `urlparse`.
    from urllib.parse import urljoin

    try:
        # urljoin leaves absolute urls untouched and resolves every relative
        # form against the page url. The previous string concatenation
        # produced "scheme://hostimg/a.png" for path-relative urls and
        # mangled protocol-relative ones.
        full_url = urljoin(webpage_url, image_url)
    except ValueError:
        # Malformed urls (e.g. a broken IPv6 host) are passed through
        # unchanged instead of failing the whole page.
        full_url = image_url

    # Drop the query string, if any.
    return full_url.partition("?")[0]
|
||
|
||
def get_image_info(
    a_tag: Any, webpage_url: str, license_type: str, license_location: str
) -> List[str]:
    """Builds the image record for the first image inside an element.

    Args:
        a_tag: The parsed html element to search for an `img` tag.
        webpage_url: The url of the webpage.
        license_type: The license type.
        license_location: The license location.
    Returns:
        A list of image url, alt text, webpage url, license type and license
        location, or None when the element holds no image with a `src`.
    """
    image_element = a_tag.find("img")

    # Nothing to report without an image that has a source.
    if not image_element or not image_element.has_attr("src"):
        return None

    return [
        get_full_image_url(image_element["src"], webpage_url),
        image_element.get("alt", ""),  # a missing alt text becomes ""
        webpage_url,
        license_type,
        license_location,
    ]
|
||
|
||
def get_images_from_soup(
    soup: Any, webpage_url: str, license_type: str, license_location: str
) -> List[List[str]]:
    """Returns the image records of every anchor in the parsed html code.

    Args:
        soup: The parsed html code.
        webpage_url: The url of the webpage.
        license_type: The license type.
        license_location: The license location.
    Returns:
        A list with one image record per anchor that contains an image.
    """
    candidates = (
        get_image_info(anchor, webpage_url, license_type, license_location)
        for anchor in soup.find_all("a")
    )
    # Anchors without a usable image yield a falsy record and are skipped.
    image_info = [record for record in candidates if record]

    logger.info(f"Found {len(image_info)} images.")
    return image_info
|
||
|
||
def get_unique_images(images: List[List[str]]) -> List[List[str]]:
    """Returns the images with duplicates removed.

    Args:
        images: A list of image records (lists of strings).
    Returns:
        The records in their original order, keeping only the first
        occurrence of each.
    """
    # Lists are not hashable, so each record is tracked by its tuple form.
    # This keeps the de-duplication linear instead of rescanning the result
    # list for every record.
    seen = set()
    unique_images = []
    for image in images:
        key = tuple(image)
        if key not in seen:
            seen.add(key)
            unique_images.append(image)
    return unique_images
79 changes: 79 additions & 0 deletions
79
examples/pipelines/commoncrawl/components/extract_image_licenses/src/utils/license_utils.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
import re | ||
import logging | ||
from typing import Any | ||
|
||
from urllib.parse import urlparse | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
def get_license_location(element: Any) -> str:
    """Returns the license location from the parsed html code.

    The ancestors of `element` are visited from the closest one outwards.
    An ancestor matches a location when it is a tag of that name, or when it
    contains a `div` whose id or class equals that name.

    Args:
        element: The parsed html element holding the license link.
    Returns:
        "footer", "aside" or "sidebar" for the first matching ancestor, or
        None when no ancestor matches.
    """
    # Walk up iteratively rather than recursively so that deeply nested
    # documents cannot exhaust the interpreter's recursion limit.
    ancestor = element.parent
    while ancestor is not None:
        # The order matters: footer is checked before aside before sidebar.
        for location in ("footer", "aside", "sidebar"):
            if (
                ancestor.name == location
                or ancestor.find("div", id=location)
                or ancestor.find("div", class_=location)
            ):
                return location
        ancestor = ancestor.parent

    # Ran out of ancestors without finding an appropriate tag.
    return None
|
||
|
||
def get_license_type_from_creative_commons_url(license_url: str) -> str:
    """Returns the license type from the creative commons url.

    Args:
        license_url: The creative commons url.
    Returns:
        "public domain" when the url path has a "publicdomain" segment,
        otherwise the first path segment containing "by" (e.g. "by-sa"), or
        None when the path has no such segment.
    """
    path_segments = urlparse(license_url).path.split("/")
    logger.info("license_split: %s", path_segments)

    if "publicdomain" in path_segments:
        return "public domain"

    # Links such as the creative commons homepage carry no license segment.
    # Returning None lets the caller skip them instead of failing with an
    # IndexError.
    return next((segment for segment in path_segments if "by" in segment), None)
|
||
|
||
def get_license_type_from_fandom_url(a_tag: Any) -> str:
    """Returns the license type from a fandom licensing link.

    Args:
        a_tag: The parsed html element of the link.
    Returns:
        The text of the element, which is used as the license type.
    """
    link_text = a_tag.text
    return link_text
|
||
|
||
def get_license_type(a_tag: Any) -> str:
    """Returns the license type from the parsed html code.

    Args:
        a_tag: The parsed html element of a link.
    Returns:
        The license type for fandom licensing and creative commons links,
        or None when the link has no href or points elsewhere.
    """
    href = a_tag.get("href")

    # A missing (None) or empty href cannot reference a license page, and
    # `in None` would raise a TypeError.
    if not href:
        return None

    if "fandom.com/licensing" in href:
        return get_license_type_from_fandom_url(a_tag)
    if "creativecommons.org" in href:
        return get_license_type_from_creative_commons_url(href)
    return None