gocolly · asciimoo · Mar 27, 2024 · Mar 25, 2024 · Mar 25, 2024
diff --git a/colly.go b/colly.go
@@ -1117,9 +1117,27 @@ func (c *Collector) handleOnResponseHeaders(r *Response) {
 }
 
 func (c *Collector) handleOnHTML(resp *Response) error {
-	if len(c.htmlCallbacks) == 0 || !strings.Contains(strings.ToLower(resp.Headers.Get("Content-Type")), "html") {
+	if len(c.htmlCallbacks) == 0 {
 		return nil
 	}
+
+	contentType := resp.Headers.Get("Content-Type")
+	if contentType == "" {
+		contentType = http.DetectContentType(resp.Body)
+	}
+	// Simplified stand-in for mime.ParseMediaType: take the media type and
+	// skip parsing the parameters that follow the first ";".
+	mediatype, _, _ := strings.Cut(contentType, ";")
+	mediatype = strings.TrimSpace(strings.ToLower(mediatype))
+
+	// TODO: also treat application/xml as XHTML when it has an
+	// appropriate doctype.
+	switch mediatype {
+	case "text/html", "application/xhtml+xml":
+	default:
+		return nil
+	}
+
 	doc, err := goquery.NewDocumentFromReader(bytes.NewBuffer(resp.Body))
 	if err != nil {
 		return err

diff --git a/colly_test.go b/colly_test.go
@@ -52,7 +52,11 @@ func newUnstartedTestServer() *httptest.Server {
 	})
 
 	mux.HandleFunc("/html", func(w http.ResponseWriter, r *http.Request) {
-		w.Header().Set("Content-Type", "text/html")
+		if r.URL.Query().Get("no-content-type") != "" {
+			w.Header()["Content-Type"] = nil
+		} else {
+			w.Header().Set("Content-Type", "text/html")
+		}
 		w.Write([]byte(`<!DOCTYPE html>
 <html>
 <head>
@@ -627,6 +631,34 @@ func TestCollectorOnHTML(t *testing.T) {
 	}
 }
 
+// TestCollectorContentSniffing verifies that OnHTML callbacks run for a
+// response that carries no Content-Type header.
+func TestCollectorContentSniffing(t *testing.T) {
+	srv := newTestServer()
+	defer srv.Close()
+
+	sawHTML := false
+	collector := NewCollector()
+
+	// Guard the premise: the server must really have omitted the header.
+	collector.OnResponse(func(resp *Response) {
+		if values := (*resp.Headers)["Content-Type"]; values != nil {
+			t.Error("Content-Type unexpectedly not nil")
+		}
+	})
+	collector.OnHTML("html", func(_ *HTMLElement) {
+		sawHTML = true
+	})
+
+	if err := collector.Visit(srv.URL + "/html?no-content-type=yes"); err != nil {
+		t.Fatal(err)
+	}
+
+	if !sawHTML {
+		t.Error("OnHTML was not called")
+	}
+}
+
 func TestCollectorURLRevisit(t *testing.T) {
 	ts := newTestServer()
 	defer ts.Close()