// IsBase64Icon reports whether iconBase64 looks like a base64-encoded image
// data URI, for example "data:image/png;base64,iVBORw0KGgo=".
//
// The leading "data:" scheme is optional. The value must contain exactly one
// comma separating the header from a non-empty payload, the header must end
// with the ";base64" marker (matched case-insensitively), and the media type
// in front of the marker must be an image/* type.
func IsBase64Icon(iconBase64 string) bool {
	parts := strings.Split(strings.TrimPrefix(iconBase64, "data:"), ",")
	if len(parts) != 2 {
		return false
	}
	header, payload := strings.TrimSpace(parts[0]), strings.TrimSpace(parts[1])
	if payload == "" {
		return false
	}

	// Without the ";base64" marker the payload is percent-encoded text rather
	// than base64, so decoding it as base64 would produce garbage bytes.
	const marker = ";base64"
	cut := len(header) - len(marker)
	if cut < 0 || !strings.EqualFold(header[cut:], marker) {
		return false
	}

	// The error is deliberately ignored: ParseMediaType still returns the media
	// type when only an optional parameter is malformed, and returns an empty
	// media type on any other failure, which fails the prefix check below.
	mediaType, _, _ := mime.ParseMediaType(header[:cut])
	return strings.HasPrefix(mediaType, "image/")
}

// DecodeBase64Icon decodes the payload of a base64 data URI and returns the
// raw bytes.
//
// The input must contain exactly one comma; everything after it, with
// surrounding whitespace trimmed, is treated as the base64 payload. The header
// before the comma is not inspected. Both padded and unpadded standard base64
// are accepted. On any error the returned slice is nil.
func DecodeBase64Icon(iconBase64 string) ([]byte, error) {
	if iconBase64 == "" {
		return nil, errors.New("empty base64 icon")
	}
	parts := strings.Split(iconBase64, ",")
	if len(parts) != 2 {
		return nil, errors.New("invalid base64 icon")
	}
	payload := strings.TrimSpace(parts[1])
	if payload == "" {
		return nil, errors.New("empty base64 icon")
	}

	decoded, err := base64.StdEncoding.DecodeString(payload)
	if err == nil {
		return decoded, nil
	}
	// Data URIs frequently omit the trailing "=" padding; retry without it
	// before giving up, and report the original error if that fails as well.
	if raw, rawErr := base64.RawStdEncoding.DecodeString(payload); rawErr == nil {
		return raw, nil
	}
	return nil, err
}
clone.SetURL(URL) - clone.Host = URL.Host - potentialURL = "" - } else { - potentialURL = URL.String() + + if !isFavUrl && !isBase64FavIcon { + continue } - if potentialURL != "" { - err = clone.MergePath(potentialURL, false) + if isFavUrl { + if URL.IsAbs() { + clone.SetURL(URL) + clone.Host = URL.Host + potentialURL = "" + } else { + potentialURL = URL.String() + } + + if potentialURL != "" { + err = clone.UpdateRelPath(potentialURL, false) + if err != nil { + continue + } + } + resp, err := hp.Do(clone, httpx.UnsafeOptions{}) if err != nil { + errCount++ continue } + faviconDecodedData = resp.Data } - resp, err := hp.Do(clone, httpx.UnsafeOptions{}) - if err != nil { - errCount++ - continue + // if the favicon is base64 encoded, decode before hashing + if isBase64FavIcon { + if faviconDecodedData, err = stringz.DecodeBase64Icon(potentialURL); err != nil { + continue + } } - MMH3Hash, MD5Hash, err := r.calculateFaviconHashWithRaw(resp.Data) + MMH3Hash, MD5Hash, err := r.calculateFaviconHashWithRaw(faviconDecodedData) if err != nil { continue } @@ -2324,7 +2340,7 @@ func (r *Runner) HandleFaviconHash(hp *httpx.HTTPX, req *retryablehttp.Request, faviconPath = potentialURL faviconMMH3 = MMH3Hash faviconMD5 = MD5Hash - faviconData = resp.Data + faviconData = faviconDecodedData break } return faviconMMH3, faviconMD5, faviconPath, faviconData, faviconURL, nil