From 4eb322b4420a5efaf0e2ef53245858b7f23d5682 Mon Sep 17 00:00:00 2001 From: Eric Fixler Date: Thu, 8 Aug 2024 10:06:27 -0400 Subject: [PATCH] Rename client identifier type, add some docs (#50) --- fetch/client.go | 4 +-- internal/headless/fetch_client.go | 2 +- internal/server/api/server_test.go | 4 +-- internal/server/version/version.go | 2 +- internal/settings/benchmark_test.go | 2 +- internal/settings/domain.go | 10 +++---- internal/settings/domain_test.go | 4 +-- internal/storage/storage.go | 2 +- resource/feed.go | 3 ++ resource/fetch_method.go | 26 ++++++++-------- resource/fetch_method_test.go | 14 ++++----- resource/url.go | 1 + resource/web_page.go | 46 ++++++++++++++--------------- resource/web_page_test.go | 2 +- 14 files changed, 63 insertions(+), 59 deletions(-) diff --git a/fetch/client.go b/fetch/client.go index 634f75b..af359f2 100644 --- a/fetch/client.go +++ b/fetch/client.go @@ -18,7 +18,7 @@ const ( type Client interface { Get(url string, headers http.Header) (*http.Response, error) - Identifier() resource.FetchClient + Identifier() resource.ClientIdentifier } type ClientOption func(*defaultClient) error @@ -49,7 +49,7 @@ type defaultClient struct { httpClient *http.Client } -func (c defaultClient) Identifier() resource.FetchClient { +func (c defaultClient) Identifier() resource.ClientIdentifier { return resource.DefaultClient } diff --git a/internal/headless/fetch_client.go b/internal/headless/fetch_client.go index 8213c56..6029192 100644 --- a/internal/headless/fetch_client.go +++ b/internal/headless/fetch_client.go @@ -40,7 +40,7 @@ func NewChromeClient(ctx context.Context, userAgent string, maxConcurrent int) ( return c, nil } -func (c client) Identifier() resource.FetchClient { +func (c client) Identifier() resource.ClientIdentifier { return resource.HeadlessChromium } diff --git a/internal/server/api/server_test.go b/internal/server/api/server_test.go index f619dbf..8b847fb 100644 --- a/internal/server/api/server_test.go +++ b/internal/server/api/server_test.go @@ -23,7 +23,7 @@ import ( ) type mockUrlFetcher struct { - fetchMethod resource.FetchClient + fetchMethod resource.ClientIdentifier } func (m *mockUrlFetcher) Fetch(url *nurl.URL) (*resource.WebPage, error) { @@ -175,7 +175,7 @@ func TestSingleHandler(t *testing.T) { name string url string handler http.HandlerFunc - expectMethod resource.FetchClient + expectMethod resource.ClientIdentifier }{ { name: "client", diff --git a/internal/server/version/version.go b/internal/server/version/version.go index 8252b2d..f86b219 100644 --- a/internal/server/version/version.go +++ b/internal/server/version/version.go @@ -1,7 +1,7 @@ package version const ( - Commit = "1a12247" + Commit = "0311158" Tag = "v0.8.6" RepoURL = "https://github.com/efixler/scrape" ) diff --git a/internal/settings/benchmark_test.go b/internal/settings/benchmark_test.go index 4a4c7ac..afd6c58 100644 --- a/internal/settings/benchmark_test.go +++ b/internal/settings/benchmark_test.go @@ -22,7 +22,7 @@ func populateTestDB(dbh *database.DBHandle, count int) ([]string, error) { ds := &DomainSettings{ Domain: d, Sitename: randomString(32), - FetchClient: resource.FetchClient(rand.Intn(3)), + FetchClient: resource.ClientIdentifier(rand.Intn(3)), UserAgent: ua.UserAgent(randomString(64)), Headers: map[string]string{ "x-token": randomString(rand.Intn(128) + 127), diff --git a/internal/settings/domain.go b/internal/settings/domain.go index f45e3f1..3908629 100644 --- a/internal/settings/domain.go +++ b/internal/settings/domain.go @@ -41,11 +41,11 @@ var ( ) type DomainSettings struct { - Domain string `json:"domain,omitempty"` - Sitename string `json:"sitename,omitempty"` - FetchClient resource.FetchClient `json:"fetch_client,omitempty"` - UserAgent ua.UserAgent `json:"user_agent,omitempty"` - Headers MIMEHeader `json:"headers,omitempty"` + Domain string `json:"domain,omitempty"` + Sitename string `json:"sitename,omitempty"` + FetchClient resource.ClientIdentifier `json:"fetch_client,omitempty"` + UserAgent ua.UserAgent `json:"user_agent,omitempty"` + Headers MIMEHeader `json:"headers,omitempty"` } // Domain names will be case-folded to lower case. diff --git a/internal/settings/domain_test.go b/internal/settings/domain_test.go index 11d64f7..7e675f3 100644 --- a/internal/settings/domain_test.go +++ b/internal/settings/domain_test.go @@ -24,7 +24,7 @@ func TestJSONUnmarshal(t *testing.T) { data string expectErr bool expectSitename string - expectFetchClient resource.FetchClient + expectFetchClient resource.ClientIdentifier expectUserAgent ua.UserAgent expectHeaders map[string]string }{ @@ -90,7 +90,7 @@ func TestJSONMarshal(t *testing.T) { expectErr bool expectJSON string expectSitename string - expectFetchClient resource.FetchClient + expectFetchClient resource.ClientIdentifier expectUserAgent ua.UserAgent expectHeaders map[string]string }{ diff --git a/internal/storage/storage.go b/internal/storage/storage.go index 7fee3a4..c0c0dec 100644 --- a/internal/storage/storage.go +++ b/internal/storage/storage.go @@ -175,7 +175,7 @@ func (s URLDataStore) Fetch(url *nurl.URL) (*resource.WebPage, error) { expiryEpoch int64 metadata string contentText string - fetchMethod resource.FetchClient + fetchMethod resource.ClientIdentifier ) err = rows.Scan(&canonicalUrl, &parsedUrl, &fetchEpoch, &expiryEpoch, &metadata, &contentText, &fetchMethod) if err != nil { diff --git a/resource/feed.go b/resource/feed.go index 13c9663..895f6b2 100644 --- a/resource/feed.go +++ b/resource/feed.go @@ -4,11 +4,14 @@ import ( "github.com/mmcdole/gofeed" ) +// Adds a RequestedURL field to the gofeed.Feed struct, +// along with the ItemLinks() function. type Feed struct { RequestedURL string `json:"requested_url,omitempty"` gofeed.Feed } +// Returns a slice of links for each item in the feed. func (f Feed) ItemLinks() []string { rval := make([]string, len(f.Items)) for i, item := range f.Items { diff --git a/resource/fetch_method.go b/resource/fetch_method.go index 879e308..302c6fa 100644 --- a/resource/fetch_method.go +++ b/resource/fetch_method.go @@ -7,50 +7,50 @@ import ( "fmt" ) -type FetchClient int +type ClientIdentifier int const ( - Unspecified FetchClient = iota + Unspecified ClientIdentifier = iota DefaultClient HeadlessChromium ) -var fetchMethods = map[FetchClient]string{ +var fetchClientNames = map[ClientIdentifier]string{ Unspecified: "unspecified", DefaultClient: "direct", HeadlessChromium: "chromium-headless", } -var ErrNoSuchFetchMethod = errors.New("no such FetchMethod") +var ErrNoSuchFetchMethod = errors.New("no such fetch client identifier") -func (f FetchClient) String() string { - if val, ok := fetchMethods[f]; ok { +func (f ClientIdentifier) String() string { + if val, ok := fetchClientNames[f]; ok { return val } else { return "Unknown" } } -func (f *FetchClient) UnmarshalText(data []byte) error { - for k, v := range fetchMethods { +func (f *ClientIdentifier) UnmarshalText(data []byte) error { + for k, v := range fetchClientNames { if v == string(data) { *f = k return nil } } return errors.Join( - fmt.Errorf("invalid FetchMethod %q", string(data)), + fmt.Errorf("invalid name %q", string(data)), ErrNoSuchFetchMethod, ) } -func (f FetchClient) MarshalText() ([]byte, error) { - if val, ok := fetchMethods[f]; ok { +func (f ClientIdentifier) MarshalText() ([]byte, error) { + if val, ok := fetchClientNames[f]; ok { return []byte(val), nil } else { - return []byte(fetchMethods[Unspecified]), + return []byte(fetchClientNames[Unspecified]), errors.Join( - fmt.Errorf("invalid FetchMethod %q", int(f)), + fmt.Errorf("invalid name %q", int(f)), ErrNoSuchFetchMethod, ) } diff --git a/resource/fetch_method_test.go b/resource/fetch_method_test.go index 9c75cf5..f405118 100644 --- a/resource/fetch_method_test.go +++ b/resource/fetch_method_test.go @@ -9,7 +9,7 @@ import ( func TestFetchMethodString(t *testing.T) { tests := []struct { name string - f FetchClient + f ClientIdentifier want string }{ { @@ -37,11 +37,11 @@ func TestFetchMethodString(t *testing.T) { func TestUnmarshal(t *testing.T) { type container struct { - F FetchClient `json:"fetch_method"` + F ClientIdentifier `json:"fetch_method"` } tests := []struct { input string - expectedValue FetchClient + expectedValue ClientIdentifier expectError bool }{ {input: "unspecified", expectedValue: Unspecified}, @@ -69,13 +69,13 @@ func TestMarshal(t *testing.T) { expectedValue string expectError bool }{ - {input: 0, expectedValue: fetchMethods[Unspecified]}, - {input: 1, expectedValue: fetchMethods[DefaultClient]}, - {input: 2, expectedValue: fetchMethods[HeadlessChromium]}, + {input: 0, expectedValue: fetchClientNames[Unspecified]}, + {input: 1, expectedValue: fetchClientNames[DefaultClient]}, + {input: 2, expectedValue: fetchClientNames[HeadlessChromium]}, {input: -1, expectError: true}, } for _, test := range tests { - fm := FetchClient(test.input) + fm := ClientIdentifier(test.input) val, err := fm.MarshalText() if (err != nil) != test.expectError { t.Errorf("%q expected error %v, got %v", test.input, test.expectError, err) diff --git a/resource/url.go b/resource/url.go index 76e5f03..f80c565 100644 --- a/resource/url.go +++ b/resource/url.go @@ -16,6 +16,7 @@ var illegalParams = []string{ "utm_brand", } +// CleanURL removes utm_ parameters from the URL func CleanURL(url *nurl.URL) *nurl.URL { if url == nil { return nil diff --git a/resource/web_page.go b/resource/web_page.go index 4038c3e..906c2e9 100644 --- a/resource/web_page.go +++ b/resource/web_page.go @@ -35,29 +35,29 @@ func NewWebPage(url nurl.URL) *WebPage { // Represents a web page that was fetched, including metadata from the page itself, // text content, and information about the fetch operation. type WebPage struct { // The page that was requested by the caller - RequestedURL *nurl.URL `json:"-"` // The page that was actually fetched - CanonicalURL *nurl.URL `json:"-"` - OriginalURL string `json:"original_url,omitempty"` // The canonical URL of the page - TTL time.Duration `json:"-"` // Time to live for the resource - FetchTime *time.Time `json:"fetch_time,omitempty"` // When the returned source was fetched - FetchMethod FetchClient `json:"fetch_method,omitempty"` // Method used to fetch the page - Hostname string `json:"hostname,omitempty"` // Hostname of the page - StatusCode int `json:"status_code,omitempty"` // HTTP status code - Error error `json:"error,omitempty"` - Title string `json:"title,omitempty"` // Title of the page - Description string `json:"description,omitempty"` // Description of the page - Sitename string `json:"sitename,omitempty"` // Name of the site - Authors []string `json:"authors,omitempty"` // Authors of the page - Date *time.Time `json:"date,omitempty"` // Date of the page - Categories []string `json:"categories,omitempty"` // Categories of the page - Tags []string `json:"tags,omitempty"` // Tags of the page - Language string `json:"language,omitempty"` // Language of the page - Image string `json:"image,omitempty"` // Image of the page - PageType string `json:"page_type,omitempty"` // Type of the page - License string `json:"license,omitempty"` // License of the page - ID string `json:"id,omitempty"` // ID of the page - Fingerprint string `json:"fingerprint,omitempty"` // Fingerprint of the page - ContentText string `json:"content_text,omitempty"` // Error that occurred during fetching + RequestedURL *nurl.URL `json:"-"` // The page that was actually fetched + CanonicalURL *nurl.URL `json:"-"` + OriginalURL string `json:"original_url,omitempty"` // The canonical URL of the page + TTL time.Duration `json:"-"` // Time to live for the resource + FetchTime *time.Time `json:"fetch_time,omitempty"` // When the returned source was fetched + FetchMethod ClientIdentifier `json:"fetch_method,omitempty"` // Method used to fetch the page + Hostname string `json:"hostname,omitempty"` // Hostname of the page + StatusCode int `json:"status_code,omitempty"` // HTTP status code + Error error `json:"error,omitempty"` + Title string `json:"title,omitempty"` // Title of the page + Description string `json:"description,omitempty"` // Description of the page + Sitename string `json:"sitename,omitempty"` // Name of the site + Authors []string `json:"authors,omitempty"` // Authors of the page + Date *time.Time `json:"date,omitempty"` // Date of the page + Categories []string `json:"categories,omitempty"` // Categories of the page + Tags []string `json:"tags,omitempty"` // Tags of the page + Language string `json:"language,omitempty"` // Language of the page + Image string `json:"image,omitempty"` // Image of the page + PageType string `json:"page_type,omitempty"` // Type of the page + License string `json:"license,omitempty"` // License of the page + ID string `json:"id,omitempty"` // ID of the page + Fingerprint string `json:"fingerprint,omitempty"` // Fingerprint of the page + ContentText string `json:"content_text,omitempty"` // Error that occurred during fetching skipMap map[skippable]bool } diff --git a/resource/web_page_test.go b/resource/web_page_test.go index 1bcc14c..4a3dd23 100644 --- a/resource/web_page_test.go +++ b/resource/web_page_test.go @@ -246,7 +246,7 @@ func TestExpireTime(t *testing.T) { func TestFetchMethod(t *testing.T) { tests := []struct { name string - f FetchClient + f ClientIdentifier want string }{ {