jakewilliami · jakewilliami · Jun 5, 2022 · Nov 4, 2021 · Nov 4, 2021 · Nov 7, 2021
diff --git a/.gitignore b/.gitignore
@@ -31,4 +31,25 @@ docs/site/
 Manifest.toml
 
 # output data
-data/
+examples/data/
+data/*.csv
+data/*.pdf
+data/all-non-faces/
+data/alt/
+data/classifiers_*
+data/faceness-scores-*
+data/haarcascades
+data/lfw-all
+data/lizzie-testset
+data/main
+data/scores
+data/wider
+data/ffhq/thumbnails128x128/
+data/ffhq/LICENSE.txt
+data/ffhq/*.py
+data/ffhq/*.json
+data/things/object_images/
+data/things/object_images_all/
+data/things/password.txt
+data/things/all_categories.txt
+data/things/all_categories_filtered.txt
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "FaceDetection"
 uuid = "00808967-75e2-4046-a522-2ca211e35506"
 authors = ["Jake W. Ireland <jakewilliami@icloud.com> and contributors"]
-version = "1.0.2"
+version = "1.1.0"
 
 [deps]
 ColorTypes = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
@@ -11,6 +11,7 @@ ImageIO = "82e4d734-157c-48bb-816b-45c225c6df19"
 ImageMagick = "6218d12a-5da1-5696-b52f-db25d2ecc6d1"
 ImageView = "86fae568-95e7-573e-a6b2-d8a6b900c9ef"
 Images = "916415d5-f1e6-5110-898d-aaa5f9f070e0"
+IntegralArrays = "1d092043-8f09-5a30-832f-7509e371ab51"
 Netpbm = "f09324ee-3d7c-5217-9330-fc30815ba969"
 ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
 QuartzImageIO = "dca85d43-d64c-5e67-8c65-017450d5d020"

diff --git a/data/ffhq/readme.md b/data/ffhq/readme.md
@@ -0,0 +1,6 @@
+The [FFHQ database](https://github.com/NVlabs/ffhq-dataset/) is a great dataset for positive training images, as it has some 70,001 images of faces, most of which show a single face on its own.
+
+To download the 128x128 thumbnails of this dataset, please run
+```shell
+$ bash setup.sh
+```
diff --git a/data/ffhq/setup.sh b/data/ffhq/setup.sh
@@ -0,0 +1,22 @@
+echo "Please ensure you have followed the Google Drive API instructions listed here: https://docs.iterative.ai/PyDrive2/quickstart/"
+sleep 5
+
+pip3 install pydrive2
+curl 'https://gist.githubusercontent.com/jakewilliami/6e361ca59df521c874a9021bde1d2c81/raw/2f277c36bcd725df71d30174e13f920d7bee7b97/download_ffhq_pydrive.py'  > download_ffhq_pydrive.py
+echo "Downloading image thumbnails"
+python3 download_ffhq.py -t --pydrive --cmd_auth
+
+echo "Moving the images into one directory and deleting subdirectories."
+# move images out of their subdirectories
+for d in thumbnails128x128/*; do
+	[ -d "$d" ] || continue
+	for f in "$d"/*; do
+		mv "$f" "thumbnails128x128/$(basename "$f")"
+	done
+done
+# clean up the subdirectories
+for d in thumbnails128x128/*; do
+	if [ -d "$d" ]; then
+		rm -d "$d"
+	done
+done
diff --git a/data/things/misc_filter_categories.txt b/data/things/misc_filter_categories.txt
@@ -0,0 +1,83 @@
+baby
+bandanna
+beanie
+beard
+blindfold
+bobsled
+bowler hat
+braid
+breathalyzer
+chick
+chicken2
+chihuahua
+cockatoo
+costume
+dalmatian
+denture
+doll
+duckling
+ear
+earplug
+eye
+eye patch
+eyeliner
+face
+figurine
+football helmet
+gargoyle
+gas mask
+gingerbread man
+girl
+glasses
+goggles
+gondola
+groundhog
+hair
+hairnet
+hat
+headband
+headdress
+headlamp
+headscarf
+hearing aid
+helmet
+hood
+jetski
+kitten
+lamb
+man
+mannequin
+mascara
+mask
+mouth
+mouthpiece
+mustache
+piggy bank
+piglet
+playpen
+pogo stick
+poodle
+poster
+pug
+puppet
+puppy
+racehorse
+ram
+rickshaw
+robot
+sarcophagus
+scarecrow
+scarf
+seagull
+seal
+skeleton
+skull
+snorkel
+snowman
+statue
+tadpole
+teddy bear
+totem pole
+toy
+warthog
+woman
diff --git a/data/things/object_categories.jl b/data/things/object_categories.jl
@@ -0,0 +1,106 @@
+"""
+    get_category_from_image_name(s::String)
+
+Return the category encoded in the image file name or path `s`.
+
+The base name is split on `'_'`, the last piece is dropped, and the remaining pieces are
+joined with spaces, so `"bowler_hat_01b.jpg"` yields `"bowler hat"`.  A base name without
+any `'_'` yields an empty string.
+"""
+get_category_from_image_name(s::String) = join(split(basename(s), '_')[1:(end - 1)], ' ')
+
+"""
+    get_object_categories(object_images::Vector{String})
+    get_object_categories(object_image_dir::String)
+
+Return the unique categories, in order of first appearance, of the given image names or
+paths, or of the entries of the directory `object_image_dir`.
+"""
+function get_object_categories(object_images::Vector{String})
+    # `unique` keeps first-occurrence order without rescanning the result per image
+    return unique(String[get_category_from_image_name(image) for image in object_images])
+end
+get_object_categories(object_image_dir::String) =
+    get_object_categories(readdir(object_image_dir))
+
+"""
+    filter_out_animals(object_image_categories::Vector{String})
+    filter_out_animals(object_image_dir::String)
+
+Return the categories that are not an exact match for an entry of a downloaded list of
+animal names (the list is lowercased before comparing).  Given a directory, its categories
+are first collected with `get_object_categories`.  Requires network access.
+"""
+function filter_out_animals(object_image_categories::Vector{String})
+    animals_url = "https://gist.githubusercontent.com/atduskgreg/3cf8ef48cb0d29cf151bedad81553a54/raw/82f142562cf50b0f6fb8010f890b2f934093553e/animals.txt"
+    # NOTE(review): `Base.download` is deprecated since Julia 1.6 in favour of
+    # `Downloads.download`; kept here so the script still runs on older Julia versions.
+    animals_file = download(animals_url)
+    animals = try
+        Set{String}(lowercase(animal) for animal in readlines(animals_file))
+    finally
+        # the list was downloaded to a temporary file; do not leave it behind
+        rm(animals_file; force = true)
+    end
+    return filter(category -> category ∉ animals, object_image_categories)
+end
+filter_out_animals(object_image_dir::String) =
+    filter_out_animals(get_object_categories(object_image_dir))
+
+# Write each entry of `lines` to `path` on its own line, replacing any existing file
+function write_lines(path::String, lines::Vector{String})
+    open(path, "w") do io
+        for line in lines
+            write(io, line, '\n')
+        end
+    end
+    return nothing
+end
+
+"""
+    main(all_object_image_dir::String)
+
+Write the category lists to file and delete the images of filtered-out categories.
+
+All categories found in `all_object_image_dir` are written to `all_categories.txt`; those
+left after removing animals and the entries of `misc_filter_categories.txt` are written to
+`all_categories_filtered.txt`.  Every image whose category is not in the filtered list is
+then **deleted** from `all_object_image_dir`.  The three text files are resolved relative
+to the current working directory.
+"""
+function main(all_object_image_dir::String)
+    outfile_all_categories_list = "all_categories.txt"
+    outfile_all_categories_filtered_list = "all_categories_filtered.txt"
+    misc_filter_categories_list = "misc_filter_categories.txt"
+
+    all_object_images = readdir(all_object_image_dir, sort = true, join = true)
+
+    all_categories = get_object_categories(all_object_images)
+    all_categories_filtered = filter_out_animals(all_categories)
+    misc_filter_categories = Set(readlines(misc_filter_categories_list))
+    filter!(category -> category ∉ misc_filter_categories, all_categories_filtered)
+
+    write_lines(outfile_all_categories_list, all_categories)
+    write_lines(outfile_all_categories_filtered_list, all_categories_filtered)
+
+    @info "There are currently $(length(all_object_images)) images in your object directory"
+    # sets keep the per-image membership checks cheap
+    categories_kept = Set(all_categories_filtered)
+    categories_warned = Set{String}()
+    removed = 0
+    for object_image in all_object_images
+        object_category = get_category_from_image_name(object_image)
+        object_category ∈ categories_kept && continue
+        if object_category ∉ categories_warned
+            @warn("Removing images of the category \"$object_category\"")
+            push!(categories_warned, object_category)
+        end
+        rm(object_image)
+        removed += 1
+    end
+    @info "We have removed all of the images that needed removing, and are left with $(length(all_object_images) - removed) images in your object directory"
+
+    return nothing
+end
+
+main("object_images/")
diff --git a/data/things/readme.md b/data/things/readme.md
@@ -0,0 +1,22 @@
+The [THINGS dataset](https://osf.io/3fu6z/) is a great dataset for object images, containing 26,107 object images.  However, some categories of images may interfere with our face detection results if we use them as negative training images.  These images span 1854 unique categories.  After filtering out [animals](https://gist.github.com/atduskgreg/3cf8ef48cb0d29cf151bedad81553a54) from this dataset, 1702 unique categories remain.  After further removing some manually selected categories that contain humans or facial features (see below), 1619 unique categories remain.
+
+To download the THINGS dataset in its entirety, run
+```shell
+$ bash setup.sh
+```
+
+Now that you have the dataset, please run
+```shell
+$ julia object_categories.jl
+```
+
+This will create two text files: `all_categories.txt` lists every unique image category, and `all_categories_filtered.txt` contains the same list with the following kinds of categories removed:
+  - Animals;
+  - Hat or hair related objects;
+  - Human-like objects;
+  - Specific parts of faces;
+  - Activities requiring humans.
+
+The Julia script will also delete the images belonging to these categories from `object_images/`, as they contain too many faces/facial features.  Beyond animals, the categories to remove are the manually selected ones listed in `misc_filter_categories.txt`.
+
+After filtering all the potentially interfering images out of the THINGS dataset, we are left with 22,558 images.
diff --git a/data/things/setup.sh b/data/things/setup.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+wget -q 'https://files.osf.io/v1/resources/jum2f/providers/osfstorage/5d4d7ec80f488d0017907d30?action=download&direct&version=2' -O 'password.txt'
+echo "Downloading object_images_A-C.zip"
+wget 'https://files.osf.io/v1/resources/jum2f/providers/osfstorage/5f89eef1d85b700286657a33?action=download&direct&version=1' -O 'object_images_A-C.zip'
+echo "Downloading object_images_D-K.zip"
+wget 'https://files.osf.io/v1/resources/jum2f/providers/osfstorage/5f89f02b37b6bb0248309053?action=download&direct&version=1' -O 'object_images_D-K.zip'
+echo "Downloading object_images_L-Q.zip"
+wget 'https://files.osf.io/v1/resources/jum2f/providers/osfstorage/5f89f10e37b6bb02483092bb?action=download&direct&version=2' -O 'object_images_L-Q.zip'
+echo "Downloading object_images_R-S.zip"
+wget 'https://files.osf.io/v1/resources/jum2f/providers/osfstorage/5f89f218d85b700291656821?action=download&direct&version=1' -O 'object_images_R-S.zip'
+echo "Downloading object_images_T-Z.zip"
+wget 'https://files.osf.io/v1/resources/jum2f/providers/osfstorage/5f89f30a37b6bb02483098c8?action=download&direct&version=1' -O 'object_images_T-Z.zip'
+
+mkdir object_images
+for z in ./*.zip; do
+	unzip -P 'things4all' "$z"
+end
+
+for d in ./object_images_*; do
+	[ -d "$d" ] || continue
+	for d2 in "$d"/*; do
+		for f in "$d2"/*; do
+			mv "$f" ./object_images/"$(basename "$f")"
+		done
+   	done
+	rm -d "$d"
+done