-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
search: change endpoint to media-imdb.com, json, faster
- Loading branch information
1 parent
ce3510f
commit ba11501
Showing
2 changed files
with
105 additions
and
214 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,139 +1,80 @@ | ||
package imdb | ||
|
||
import ( | ||
"encoding/json" | ||
"errors" | ||
"fmt" | ||
"io/ioutil" | ||
"io" | ||
"net/http" | ||
"net/url" | ||
"regexp" | ||
"strconv" | ||
"strings" | ||
) | ||
|
||
const searchURL = "https://www.imdb.com/find?%s" | ||
const searchURL = "https://v3.sg.media-imdb.com/suggestion/x/%s.json?includeVideos=0" | ||
|
||
var ( | ||
// titleIDLinkRE captures the title ID (tt followed by digits) from a canonical link tag. | ||
titleIDLinkRE = regexp.MustCompile(`<link rel="canonical" href="https://www.imdb.com/title/(tt\d+)/"`) | ||
// searchTitleListRE matches on results list. (?s) for multi-line. | ||
searchTitleListRE = regexp.MustCompile(`(?s)<table class="findList">(.*?)</table>`) | ||
// searchTitleRE matches on titles: it captures the title ID, the link text and the text following the link. | ||
searchTitleRE = regexp.MustCompile(`<a href="/title/(tt\d+).*?>([^<]+)</a>([^<]+)`) | ||
// searchExtraRE captures the content of each parenthesised group, e.g. "II", "2000" or "TV Series". | ||
searchExtraRE = regexp.MustCompile(`\(([^(]+)\)`) | ||
// searchYearRE matches four consecutive digits; used to pick a year out of an extra. | ||
searchYearRE = regexp.MustCompile(`\d{4}`) | ||
// Ignore roman numerals used for duplicates in a year, e.g. Title (II) (2000) (TV Series) | ||
searchRomanRE = regexp.MustCompile(`^[MDCLXVI]+$`) | ||
) | ||
// SearchQueryResponse represents the JSON struct returned by a query on searchURL. | ||
// Field names are readable aliases for the short JSON keys given in the struct tags. | ||
type SearchQueryResponse struct { | ||
// Matches is decoded from the "d" array. | ||
Matches []struct { | ||
// Image is decoded from the "i" object; SearchTitle copies ImageURL into the Title's Poster.URL. | ||
Image struct { | ||
Height int `json:"height"` | ||
ImageURL string `json:"imageUrl"` | ||
Width int `json:"width"` | ||
} `json:"i"` | ||
// ID is the entry identifier; SearchTitle keeps only entries whose ID starts with "tt". | ||
ID string `json:"id"` | ||
// Title is decoded from "l" and becomes the Title's Name. | ||
Title string `json:"l"` | ||
// MainActors is decoded from "s" (presumably a short cast summary; verify against a live response). | ||
MainActors string `json:"s"` | ||
// Q is decoded from the per-entry "q" key (meaning not evident here; TODO confirm). | ||
Q string `json:"q,omitempty"` | ||
// Type is decoded from "qid" and becomes the Title's Type, e.g. "movie" or "tvSeries". | ||
Type string `json:"qid,omitempty"` | ||
Rank int `json:"rank,omitempty"` | ||
// Year is decoded from "y"; it stays 0 when the key is absent. | ||
Year int `json:"y,omitempty"` | ||
// YearInProduction is decoded from "yr" (presumably a year range for series; TODO confirm). | ||
YearInProduction string `json:"yr,omitempty"` | ||
} `json:"d"` | ||
// Query is decoded from the top-level "q" key (presumably the echoed search query; TODO confirm). | ||
Query string `json:"q"` | ||
// V is decoded from the top-level "v" key (presumably a format version; TODO confirm). | ||
V int `json:"v"` | ||
} | ||
|
||
// SearchTitle searches for titles matching name and returns partial Titles. | ||
// A partial Title has only ID, URL, Name, Year and Type set (the JSON-based version also sets Poster.URL). | ||
// A full Title can be obtained with NewTitle, at the cost of extra requests. | ||
func SearchTitle(c *http.Client, name string) ([]Title, error) { | ||
// Sections: all, tt, ep, nm, co, kw, ch, vi, qu, bi, pl | ||
params := url.Values{"q": {name}, "s": {"tt"}} | ||
resp, err := c.Get(fmt.Sprintf(searchURL, params.Encode())) | ||
resp, err := c.Get(fmt.Sprintf(searchURL, name)) | ||
if err != nil { | ||
return nil, err | ||
} | ||
defer resp.Body.Close() | ||
defer func(Body io.ReadCloser) { | ||
_ = Body.Close() | ||
}(resp.Body) | ||
if resp.StatusCode != http.StatusOK { | ||
if resp.StatusCode == http.StatusForbidden { | ||
return nil, errors.New("forbidden (e.g. denied by AWS WAF)") | ||
} | ||
return nil, fmt.Errorf("imdb: status not ok: %v", resp.Status) | ||
} | ||
page, err := ioutil.ReadAll(resp.Body) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
list := searchTitleListRE.FindSubmatch(page) | ||
if list == nil { | ||
return newSearchTitle(page) | ||
} | ||
results := searchTitleRE.FindAllSubmatch(list[1], -1) | ||
if results == nil { | ||
return nil, nil // No results. | ||
decoder := json.NewDecoder(resp.Body) | ||
if decoder == nil { | ||
return nil, errors.New("imdb: decoder is nil") | ||
} | ||
|
||
var t []Title | ||
for _, r := range results { | ||
var titleYear int | ||
var titleType string | ||
extras := searchExtraRE.FindAllSubmatch(r[3], -1) | ||
// expecting 0-3 max (any of roman/type/year) | ||
if len(extras) > 3 { | ||
return nil, fmt.Errorf("search: too many extras") | ||
} | ||
for i, x := range extras { | ||
if i == 0 && searchRomanRE.Match(x[1]) { | ||
continue // ignore roman numerals used for duplicates in a year | ||
} | ||
if digits := searchYearRE.FindSubmatch(x[1]); digits != nil { | ||
year, err := strconv.Atoi(string(digits[0])) | ||
if err != nil { | ||
return nil, err // should not happen as regexp matches digits | ||
} | ||
titleYear = year | ||
} else { | ||
titleType = string(x[1]) | ||
} | ||
} | ||
id := string(r[1]) | ||
t = append(t, Title{ | ||
ID: id, | ||
URL: fmt.Sprintf(titleURL, id), | ||
Name: decode(string(r[2])), | ||
Year: titleYear, | ||
Type: titleType, | ||
}) | ||
} | ||
return t, nil | ||
} | ||
|
||
var ( | ||
// newSearchTitleListRE matches on each result in the list. | ||
newSearchTitleListRE = regexp.MustCompile(`<div class="ipc-metadata-list-summary-item__tc">(.*?)</div>`) | ||
// newSearchTitleRE matches on titles, capturing the title ID and the link text. | ||
newSearchTitleRE = regexp.MustCompile(`<a .*?href="/title/(tt\d+).*?>([^<]+)</a>`) | ||
// newSearchYearRE captures a four-digit year from a span or label inside a <ul>, | ||
// plus the remainder of that <ul> as its 2nd capture group. | ||
newSearchYearRE = regexp.MustCompile(`<ul\s*[^>]*>.*?<(?:span|label)\s*[^>]*>(\d{4})[^<]*</(?:span|label)>(.*?)</ul>`) | ||
// newSearchTypeRE captures the text content of a <span>. | ||
newSearchTypeRE = regexp.MustCompile(`<span\s*[^>]*>([^<]*)</span>`) // from within the year 2nd capture group | ||
) | ||
|
||
func newSearchTitle(page []byte) ([]Title, error) { | ||
list := newSearchTitleListRE.FindAllSubmatch(page, -1) | ||
if list == nil { | ||
return nil, NewErrParse("list") | ||
var searchResponse SearchQueryResponse | ||
err = decoder.Decode(&searchResponse) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
var t []Title | ||
for _, e := range list { | ||
r := newSearchTitleRE.FindSubmatch(e[1]) | ||
if r == nil { | ||
return nil, NewErrParse("list title") | ||
for _, match := range searchResponse.Matches { | ||
if strings.HasPrefix(match.ID, "tt") { | ||
t = append(t, Title{ | ||
ID: match.ID, | ||
URL: fmt.Sprintf(titleURL, match.ID), | ||
Name: match.Title, | ||
Year: match.Year, | ||
Type: match.Type, | ||
Poster: Media{ | ||
URL: match.Image.ImageURL, | ||
}, | ||
}) | ||
} | ||
id := string(r[1]) | ||
|
||
var titleYear int | ||
var titleType string | ||
|
||
if m := newSearchYearRE.FindSubmatch(e[1]); m != nil { | ||
year, err := strconv.Atoi(string(m[1])) | ||
if err != nil { | ||
return nil, err // should not happen as regexp matches digits | ||
} | ||
titleYear = year | ||
if m := newSearchTypeRE.FindSubmatch(m[2]); m != nil { | ||
titleType = string(m[1]) | ||
} | ||
} | ||
|
||
t = append(t, Title{ | ||
ID: id, | ||
URL: fmt.Sprintf(titleURL, id), | ||
Name: decode(string(r[2])), | ||
Year: titleYear, | ||
Type: titleType, | ||
}) | ||
} | ||
return t, nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,118 +1,68 @@ | ||
package imdb | ||
|
||
import ( | ||
"fmt" | ||
"testing" | ||
) | ||
|
||
func TestSearchTitle(t *testing.T) { | ||
title := "Lord of the rings" | ||
r, err := SearchTitle(client, title) | ||
func TestSearch(t *testing.T) { | ||
want := []Title{ | ||
{ | ||
ID: "tt4647692", | ||
URL: "https://www.imdb.com/title/tt4647692", | ||
Name: "Letterkenny", | ||
Year: 2016, | ||
Type: "tvSeries", | ||
Poster: Media{ID: "", TitleID: "", URL: "https://m.media-amazon.com/images/M/MV5BMjM2MjE3MDktODU4Yi00YzY5LTgzMDktYWIzMTMxYmU2YzM5XkEyXkFqcGdeQXVyMTkxNjUyNQ@@._V1_.jpg", ContentURL: ""}, | ||
}, | ||
{ | ||
ID: "tt3238856", | ||
URL: "https://www.imdb.com/title/tt3238856", | ||
Name: "Letterkenny Problems", | ||
Year: 2013, | ||
Type: "tvSeries", | ||
Poster: Media{ID: "", TitleID: "", URL: "https://m.media-amazon.com/images/M/MV5BM2UxNjU4NDYtNzk4Yi00MDAzLWE5MjgtMjI4NDhiZWNlNTVkXkEyXkFqcGdeQXVyMTgxNDgxOQ@@._V1_.jpg", ContentURL: ""}, | ||
}, | ||
{ | ||
ID: "tt3913450", | ||
URL: "https://www.imdb.com/title/tt3913450", | ||
Name: "Letterkenny People", | ||
Year: 2014, | ||
Type: "tvSeries", | ||
Poster: Media{ID: "", TitleID: "", URL: "https://m.media-amazon.com/images/M/MV5BYTI5OWI2NjMtODhlMS00Zjk5LThhYWEtOWFlYWQ0MDMzNGMyXkEyXkFqcGdeQXVyNjUxMjc1OTM@._V1_.jpg", ContentURL: ""}, | ||
}, | ||
{ | ||
ID: "tt30767687", | ||
URL: "https://www.imdb.com/title/tt30767687", | ||
Name: "The Produce Stand: Letterkenny", | ||
Year: 2020, | ||
Type: "podcastSeries", | ||
Poster: Media{ID: "", TitleID: "", URL: "https://m.media-amazon.com/images/M/MV5BMWViNmQ4YjAtYzI4MS00YTA3LThlNzctZTkyM2RjODNjNDcxXkEyXkFqcGdeQXVyNDY3MzkyMjM@._V1_.jpg", ContentURL: ""}, | ||
}, | ||
{ | ||
ID: "tt0038948", | ||
URL: "https://www.imdb.com/title/tt0038948", | ||
Name: "Sister Kenny", | ||
Year: 1946, | ||
Type: "movie", | ||
Poster: Media{ID: "", TitleID: "", URL: "https://m.media-amazon.com/images/M/MV5BYjZlMzc0YjgtNzgwYi00NzAyLTk4YjUtNTMyMzEwYzM2MzgwXkEyXkFqcGdeQXVyNjc0MzMzNjA@._V1_.jpg", ContentURL: ""}, | ||
}, | ||
{ | ||
ID: "tt5572524", | ||
URL: "https://www.imdb.com/title/tt5572524", | ||
Name: "A Letter from Rose Kennedy", | ||
Year: 0, | ||
Type: "movie", | ||
Poster: Media{ID: "", TitleID: "", URL: "", ContentURL: ""}, | ||
}, | ||
} | ||
got, err := SearchTitle(client, "Letterkenny") | ||
if err != nil { | ||
t.Fatalf("SearchTitle(%s) error: %v", title, err) | ||
} | ||
if len(r) < 10 { | ||
t.Fatalf("SearchTitle(%s) len < 50: %d", title, len(r)) | ||
} | ||
if accepted := map[string]bool{ | ||
"tt7631058": true, // The Lord of the Rings: The Rings of Power (2024-) (TV Series) | ||
"tt14824600": true, // The Lord of the Rings: The War of the Rohirrim (2024) (Animation) | ||
"tt0120737": true, // The Lord of the Rings: The Fellowship of the Ring (2001) | ||
"tt0167260": true, // The Lord of the Rings: The Return of the King (2003) | ||
"tt0167261": true, // The Lord of the Rings: The Two Towers (2002) | ||
}; !accepted[r[0].ID] { | ||
t.Errorf("SearchTitle(%s)[0].ID = %v; want any of %v", title, r[0].ID, accepted) | ||
} | ||
if accepted := map[string]bool{ | ||
"The Lord of the Rings: The Rings of Power": true, | ||
"The Lord of the Rings: The War of the Rohirrim": true, | ||
"The Lord of the Rings: The Fellowship of the Ring": true, | ||
"The Lord of the Rings: The Return of the King": true, | ||
"The Lord of the Rings: The Two Towers": true, | ||
}; !accepted[r[0].Name] { | ||
t.Errorf("SearchTitle(%s)[0].Name = %v; want any of %v", title, r[0].Name, accepted) | ||
} | ||
errors := []string{} | ||
for i, want := range []int{ | ||
2022, | ||
2024, | ||
2001, | ||
2003, | ||
2002, | ||
} { | ||
if r[i].Year != want { | ||
errors = append(errors, fmt.Sprintf("SearchTitle(%s)[%d].Year = %d; want %d", title, i, r[i].Year, want)) | ||
} | ||
} | ||
if len(errors) > 3 { | ||
t.Errorf("> 3 errors: %v", errors) | ||
} | ||
errors = []string{} | ||
for i, want := range []string{ | ||
"TV Series", | ||
"", | ||
"", | ||
"", | ||
"TV Series", | ||
} { | ||
if r[i].Type != want { | ||
errors = append(errors, fmt.Sprintf("SearchTitle(%s)[%d].Type = %s; want %s", title, i, r[i].Type, want)) | ||
t.Errorf("SearchTitle(\"Letterkenny\") error: %v", err) | ||
} else { | ||
for i, wGot := range got { | ||
if err := diffStruct(wGot, want[i]); err != nil { | ||
t.Errorf("SearchTitle(\"Letterkenny\") error: %v", err) | ||
} | ||
} | ||
} | ||
if len(errors) > 3 { | ||
t.Errorf("> 3 errors: %v", errors) | ||
} | ||
} | ||
|
||
// TestSearchTitleUnicode searches for a title containing a non-ASCII character | ||
// and checks that at least one result comes back and that the first result's | ||
// ID is one of the accepted IDs. | ||
func TestSearchTitleUnicode(t *testing.T) { | ||
title := "Les Filles De L'Océan" | ||
r, err := SearchTitle(client, title) | ||
if err != nil { | ||
t.Fatalf("SearchTitle(%s) error: %v", title, err) | ||
} | ||
// Stop here on an empty result so that r[0] below cannot panic. | ||
if len(r) == 0 { | ||
t.Fatalf("SearchTitle(%s) len = %d; want %d", title, len(r), 1) | ||
} | ||
// Several IDs are accepted for the first result. | ||
if accepted := map[string]bool{ | ||
"tt5761478": true, // Harlots (TV Series) (2017-2019) | ||
"tt0244764": true, // Rip Girls (TV Movie) (2000) | ||
"tt0098797": true, // Les filles de Caleb (TV Series) (1990-) | ||
"tt22522556": true, // Les Filles de l'Océan | ||
}; !accepted[r[0].ID] { | ||
t.Errorf("SearchTitle(%s)[0] = %v; want any of %v", title, r[0].ID, accepted) | ||
} | ||
} | ||
|
||
// TestSearchTitlePositions searches for "Burlesque" and checks that at least | ||
// three results come back and that the first result's ID is one of the | ||
// accepted IDs. | ||
func TestSearchTitlePositions(t *testing.T) { | ||
title := "Burlesque" | ||
r, err := SearchTitle(client, title) | ||
if err != nil { | ||
t.Fatalf("SearchTitle(%s) error: %v", title, err) | ||
} | ||
// The failure message reports the same lower bound that the condition enforces. | ||
if len(r) < 3 { | ||
t.Fatalf("SearchTitle(%s) len = %d; want >= %d", title, len(r), 3) | ||
} | ||
// Several IDs are accepted for the first result. | ||
if accepted := map[string]bool{ | ||
"tt1126591": true, // Burlesque (I) (2010) | ||
"tt1586713": true, // Burlesque (II) (2010) | ||
"tt11288016": true, // Jak si nepodelat zivot (2019) (TV Mini Series) aka "Burlesque" | ||
}; !accepted[r[0].ID] { | ||
t.Errorf("SearchTitle(%s)[0] = %v; want any of %v", title, r[0].ID, accepted) | ||
} | ||
} | ||
|
||
// TestMachete searches for a title containing punctuation ("..." and "!") and | ||
// checks that at least one result comes back and that the first result's ID | ||
// is tt2002719. | ||
func TestMachete(t *testing.T) { | ||
title := "Machete Kills Again... In Space!" | ||
r, err := SearchTitle(client, title) | ||
if err != nil { | ||
t.Fatalf("SearchTitle(%s) error: %v", title, err) | ||
} | ||
// Stop here on an empty result so that r[0] below cannot panic. | ||
if len(r) == 0 { | ||
t.Fatalf("SearchTitle(%s) len = %d; want > 0", title, len(r)) | ||
} | ||
if accepted := map[string]bool{ | ||
"tt2002719": true, | ||
}; !accepted[r[0].ID] { | ||
t.Errorf("SearchTitle(%s)[0] = %v; want any of %v", title, r[0].ID, accepted) | ||
} | ||
} |