Skip to content

Commit

Permalink
search: change endpoint to media-imdb.com, json, faster
Browse files Browse the repository at this point in the history
  • Loading branch information
windblows95 authored and StalkR committed Aug 3, 2024
1 parent ce3510f commit ba11501
Show file tree
Hide file tree
Showing 2 changed files with 105 additions and 214 deletions.
153 changes: 47 additions & 106 deletions search.go
Original file line number Diff line number Diff line change
@@ -1,139 +1,80 @@
package imdb

import (
"encoding/json"
"errors"
"fmt"
"io/ioutil"
"io"
"net/http"
"net/url"
"regexp"
"strconv"
"strings"
)

const searchURL = "https://www.imdb.com/find?%s"
const searchURL = "https://v3.sg.media-imdb.com/suggestion/x/%s.json?includeVideos=0"

var (
// titleIDLinkRE captures the title ID (tt followed by digits) from a page's canonical link tag.
titleIDLinkRE = regexp.MustCompile(`<link rel="canonical" href="https://www.imdb.com/title/(tt\d+)/"`)
// searchTitleListRE captures the body of the results table (class "findList"). (?s) lets . span newlines.
searchTitleListRE = regexp.MustCompile(`(?s)<table class="findList">(.*?)</table>`)
// searchTitleRE matches one title link: group 1 is the ID, group 2 the link text, group 3 the text following the link.
searchTitleRE = regexp.MustCompile(`<a href="/title/(tt\d+).*?>([^<]+)</a>([^<]+)`)
// searchExtraRE captures the content of a parenthesized group found after a title.
searchExtraRE = regexp.MustCompile(`\(([^(]+)\)`)
// searchYearRE matches a run of four digits, which the search code parses as the year.
searchYearRE = regexp.MustCompile(`\d{4}`)
// searchRomanRE matches text made only of roman-numeral letters. Such extras are ignored because
// they mark duplicates in a year, e.g. Title (II) (2000) (TV Series).
searchRomanRE = regexp.MustCompile(`^[MDCLXVI]+$`)
)
// SearchQueryResponse represents the JSON struct returned by a query on searchURL.
// Field notes below come from the field names and from how SearchTitle reads them;
// NOTE(review): the endpoint's schema is not documented here, confirm before relying on it.
type SearchQueryResponse struct {
// Matches holds the result entries, decoded from the "d" key.
Matches []struct {
// Image is the picture attached to a match ("i" key); SearchTitle copies ImageURL into Title.Poster.URL.
Image struct {
Height int `json:"height"`
ImageURL string `json:"imageUrl"`
Width int `json:"width"`
} `json:"i"`
// ID is the entry identifier; SearchTitle keeps only entries whose ID starts with "tt".
ID string `json:"id"`
// Title ("l" key) is used as Title.Name.
Title string `json:"l"`
// MainActors ("s" key): presumably a free-text list of main actors, going by the name; verify against real responses.
MainActors string `json:"s"`
// Q ("q" key): meaning not established by this file; TODO confirm.
Q string `json:"q,omitempty"`
// Type ("qid" key) is copied into Title.Type; the tests expect values such as "tvSeries" and "movie".
Type string `json:"qid,omitempty"`
Rank int `json:"rank,omitempty"`
// Year ("y" key) is copied into Title.Year; it stays 0 when the key is absent.
Year int `json:"y,omitempty"`
// YearInProduction ("yr" key): presumably a year range as text; TODO confirm.
YearInProduction string `json:"yr,omitempty"`
} `json:"d"`
// Query ("q" key): presumably echoes the search query; verify against real responses.
Query string `json:"q"`
// V ("v" key): meaning not established by this file; TODO confirm.
V int `json:"v"`
}

// SearchTitle searches for titles matching name and returns partial Titles.
// A partial Title has only ID, URL, Name, Year and Type set (plus Poster.URL
// when results are read from the JSON response).
// A full Title can be obtained with NewTitle, at the cost of extra requests.
func SearchTitle(c *http.Client, name string) ([]Title, error) {
// Sections: all, tt, ep, nm, co, kw, ch, vi, qu, bi, pl
params := url.Values{"q": {name}, "s": {"tt"}}
resp, err := c.Get(fmt.Sprintf(searchURL, params.Encode()))
resp, err := c.Get(fmt.Sprintf(searchURL, name))
if err != nil {
return nil, err
}
defer resp.Body.Close()
defer func(Body io.ReadCloser) {
_ = Body.Close()
}(resp.Body)
if resp.StatusCode != http.StatusOK {
if resp.StatusCode == http.StatusForbidden {
return nil, errors.New("forbidden (e.g. denied by AWS WAF)")
}
return nil, fmt.Errorf("imdb: status not ok: %v", resp.Status)
}
page, err := ioutil.ReadAll(resp.Body)
if err != nil {
return nil, err
}

list := searchTitleListRE.FindSubmatch(page)
if list == nil {
return newSearchTitle(page)
}
results := searchTitleRE.FindAllSubmatch(list[1], -1)
if results == nil {
return nil, nil // No results.
decoder := json.NewDecoder(resp.Body)
if decoder == nil {
return nil, errors.New("imdb: decoder is nil")
}

var t []Title
for _, r := range results {
var titleYear int
var titleType string
extras := searchExtraRE.FindAllSubmatch(r[3], -1)
// expecting 0-3 max (any of roman/type/year)
if len(extras) > 3 {
return nil, fmt.Errorf("search: too many extras")
}
for i, x := range extras {
if i == 0 && searchRomanRE.Match(x[1]) {
continue // ignore roman numerals used for duplicates in a year
}
if digits := searchYearRE.FindSubmatch(x[1]); digits != nil {
year, err := strconv.Atoi(string(digits[0]))
if err != nil {
return nil, err // should not happen as regexp matches digits
}
titleYear = year
} else {
titleType = string(x[1])
}
}
id := string(r[1])
t = append(t, Title{
ID: id,
URL: fmt.Sprintf(titleURL, id),
Name: decode(string(r[2])),
Year: titleYear,
Type: titleType,
})
}
return t, nil
}

var (
// newSearchTitleListRE matches each result item in the list (a div with class
// "ipc-metadata-list-summary-item__tc") and captures its content.
newSearchTitleListRE = regexp.MustCompile(`<div class="ipc-metadata-list-summary-item__tc">(.*?)</div>`)
// newSearchTitleRE matches a title link: group 1 is the ID, group 2 the link text.
newSearchTitleRE = regexp.MustCompile(`<a .*?href="/title/(tt\d+).*?>([^<]+)</a>`)
// newSearchYearRE captures, inside a <ul>, a four-digit year at the start of a span or label (group 1)
// and the rest of that <ul> (group 2).
newSearchYearRE = regexp.MustCompile(`<ul\s*[^>]*>.*?<(?:span|label)\s*[^>]*>(\d{4})[^<]*</(?:span|label)>(.*?)</ul>`)
// newSearchTypeRE captures the text of a span; it is applied to newSearchYearRE's second capture group.
newSearchTypeRE = regexp.MustCompile(`<span\s*[^>]*>([^<]*)</span>`) // from within the year 2nd capture group
)

func newSearchTitle(page []byte) ([]Title, error) {
list := newSearchTitleListRE.FindAllSubmatch(page, -1)
if list == nil {
return nil, NewErrParse("list")
var searchResponse SearchQueryResponse
err = decoder.Decode(&searchResponse)
if err != nil {
return nil, err
}

var t []Title
for _, e := range list {
r := newSearchTitleRE.FindSubmatch(e[1])
if r == nil {
return nil, NewErrParse("list title")
for _, match := range searchResponse.Matches {
if strings.HasPrefix(match.ID, "tt") {
t = append(t, Title{
ID: match.ID,
URL: fmt.Sprintf(titleURL, match.ID),
Name: match.Title,
Year: match.Year,
Type: match.Type,
Poster: Media{
URL: match.Image.ImageURL,
},
})
}
id := string(r[1])

var titleYear int
var titleType string

if m := newSearchYearRE.FindSubmatch(e[1]); m != nil {
year, err := strconv.Atoi(string(m[1]))
if err != nil {
return nil, err // should not happen as regexp matches digits
}
titleYear = year
if m := newSearchTypeRE.FindSubmatch(m[2]); m != nil {
titleType = string(m[1])
}
}

t = append(t, Title{
ID: id,
URL: fmt.Sprintf(titleURL, id),
Name: decode(string(r[2])),
Year: titleYear,
Type: titleType,
})
}
return t, nil
}
166 changes: 58 additions & 108 deletions search_test.go
Original file line number Diff line number Diff line change
@@ -1,118 +1,68 @@
package imdb

import (
"fmt"
"testing"
)

func TestSearchTitle(t *testing.T) {
title := "Lord of the rings"
r, err := SearchTitle(client, title)
func TestSearch(t *testing.T) {
want := []Title{
{
ID: "tt4647692",
URL: "https://www.imdb.com/title/tt4647692",
Name: "Letterkenny",
Year: 2016,
Type: "tvSeries",
Poster: Media{ID: "", TitleID: "", URL: "https://m.media-amazon.com/images/M/MV5BMjM2MjE3MDktODU4Yi00YzY5LTgzMDktYWIzMTMxYmU2YzM5XkEyXkFqcGdeQXVyMTkxNjUyNQ@@._V1_.jpg", ContentURL: ""},
},
{
ID: "tt3238856",
URL: "https://www.imdb.com/title/tt3238856",
Name: "Letterkenny Problems",
Year: 2013,
Type: "tvSeries",
Poster: Media{ID: "", TitleID: "", URL: "https://m.media-amazon.com/images/M/MV5BM2UxNjU4NDYtNzk4Yi00MDAzLWE5MjgtMjI4NDhiZWNlNTVkXkEyXkFqcGdeQXVyMTgxNDgxOQ@@._V1_.jpg", ContentURL: ""},
},
{
ID: "tt3913450",
URL: "https://www.imdb.com/title/tt3913450",
Name: "Letterkenny People",
Year: 2014,
Type: "tvSeries",
Poster: Media{ID: "", TitleID: "", URL: "https://m.media-amazon.com/images/M/MV5BYTI5OWI2NjMtODhlMS00Zjk5LThhYWEtOWFlYWQ0MDMzNGMyXkEyXkFqcGdeQXVyNjUxMjc1OTM@._V1_.jpg", ContentURL: ""},
},
{
ID: "tt30767687",
URL: "https://www.imdb.com/title/tt30767687",
Name: "The Produce Stand: Letterkenny",
Year: 2020,
Type: "podcastSeries",
Poster: Media{ID: "", TitleID: "", URL: "https://m.media-amazon.com/images/M/MV5BMWViNmQ4YjAtYzI4MS00YTA3LThlNzctZTkyM2RjODNjNDcxXkEyXkFqcGdeQXVyNDY3MzkyMjM@._V1_.jpg", ContentURL: ""},
},
{
ID: "tt0038948",
URL: "https://www.imdb.com/title/tt0038948",
Name: "Sister Kenny",
Year: 1946,
Type: "movie",
Poster: Media{ID: "", TitleID: "", URL: "https://m.media-amazon.com/images/M/MV5BYjZlMzc0YjgtNzgwYi00NzAyLTk4YjUtNTMyMzEwYzM2MzgwXkEyXkFqcGdeQXVyNjc0MzMzNjA@._V1_.jpg", ContentURL: ""},
},
{
ID: "tt5572524",
URL: "https://www.imdb.com/title/tt5572524",
Name: "A Letter from Rose Kennedy",
Year: 0,
Type: "movie",
Poster: Media{ID: "", TitleID: "", URL: "", ContentURL: ""},
},
}
got, err := SearchTitle(client, "Letterkenny")
if err != nil {
t.Fatalf("SearchTitle(%s) error: %v", title, err)
}
if len(r) < 10 {
t.Fatalf("SearchTitle(%s) len < 50: %d", title, len(r))
}
if accepted := map[string]bool{
"tt7631058": true, // The Lord of the Rings: The Rings of Power (2024-) (TV Series)
"tt14824600": true, // The Lord of the Rings: The War of the Rohirrim (2024) (Animation)
"tt0120737": true, // The Lord of the Rings: The Fellowship of the Ring (2001)
"tt0167260": true, // The Lord of the Rings: The Return of the King (2003)
"tt0167261": true, // The Lord of the Rings: The Two Towers (2002)
}; !accepted[r[0].ID] {
t.Errorf("SearchTitle(%s)[0].ID = %v; want any of %v", title, r[0].ID, accepted)
}
if accepted := map[string]bool{
"The Lord of the Rings: The Rings of Power": true,
"The Lord of the Rings: The War of the Rohirrim": true,
"The Lord of the Rings: The Fellowship of the Ring": true,
"The Lord of the Rings: The Return of the King": true,
"The Lord of the Rings: The Two Towers": true,
}; !accepted[r[0].Name] {
t.Errorf("SearchTitle(%s)[0].Name = %v; want any of %v", title, r[0].Name, accepted)
}
errors := []string{}
for i, want := range []int{
2022,
2024,
2001,
2003,
2002,
} {
if r[i].Year != want {
errors = append(errors, fmt.Sprintf("SearchTitle(%s)[%d].Year = %d; want %d", title, i, r[i].Year, want))
}
}
if len(errors) > 3 {
t.Errorf("> 3 errors: %v", errors)
}
errors = []string{}
for i, want := range []string{
"TV Series",
"",
"",
"",
"TV Series",
} {
if r[i].Type != want {
errors = append(errors, fmt.Sprintf("SearchTitle(%s)[%d].Type = %s; want %s", title, i, r[i].Type, want))
t.Errorf("SearchTitle(\"Letterkenny\") error: %v", err)
} else {
for i, wGot := range got {
if err := diffStruct(wGot, want[i]); err != nil {
t.Errorf("SearchTitle(\"Letterkenny\") error: %v", err)
}
}
}
if len(errors) > 3 {
t.Errorf("> 3 errors: %v", errors)
}
}

// TestSearchTitleUnicode searches for a name containing a non-ASCII character
// and checks that the first result is one of the accepted title IDs.
func TestSearchTitleUnicode(t *testing.T) {
	title := "Les Filles De L'Océan"

	// Any of these IDs is acceptable as the top hit.
	accepted := map[string]bool{
		"tt5761478":  true, // Harlots (TV Series) (2017-2019)
		"tt0244764":  true, // Rip Girls (TV Movie) (2000)
		"tt0098797":  true, // Les filles de Caleb (TV Series) (1990-)
		"tt22522556": true, // Les Filles de l'Océan
	}

	results, err := SearchTitle(client, title)
	if err != nil {
		t.Fatalf("SearchTitle(%s) error: %v", title, err)
	}
	// Stop here on an empty result so results[0] below cannot panic.
	if len(results) == 0 {
		t.Fatalf("SearchTitle(%s) len = %d; want %d", title, len(results), 1)
	}

	if top := results[0].ID; !accepted[top] {
		t.Errorf("SearchTitle(%s)[0] = %v; want any of %v", title, top, accepted)
	}
}

// TestSearchTitlePositions searches for a name shared by several titles and
// checks that at least three results come back and that the first one is
// among the accepted title IDs.
func TestSearchTitlePositions(t *testing.T) {
	title := "Burlesque"
	r, err := SearchTitle(client, title)
	if err != nil {
		t.Fatalf("SearchTitle(%s) error: %v", title, err)
	}
	// The failure message now states the bound the condition actually checks.
	if len(r) < 3 {
		t.Fatalf("SearchTitle(%s) len = %d; want >= %d", title, len(r), 3)
	}
	if accepted := map[string]bool{
		"tt1126591":  true, // Burlesque (I) (2010)
		"tt1586713":  true, // Burlesque (II) (2010)
		"tt11288016": true, // Jak si nepodelat zivot (2019) (TV Mini Series) aka "Burlesque"
	}; !accepted[r[0].ID] {
		t.Errorf("SearchTitle(%s)[0] = %v; want any of %v", title, r[0].ID, accepted)
	}
}

// TestMachete searches for a title whose name contains punctuation and checks
// that the first result carries the expected title ID.
func TestMachete(t *testing.T) {
	title := "Machete Kills Again... In Space!"

	// Kept as a map so the failure message prints the accepted set.
	accepted := map[string]bool{
		"tt2002719": true,
	}

	results, err := SearchTitle(client, title)
	if err != nil {
		t.Fatalf("SearchTitle(%s) error: %v", title, err)
	}
	// Stop here on an empty result so results[0] below cannot panic.
	if len(results) == 0 {
		t.Fatalf("SearchTitle(%s) len = %d; want > 0", title, len(results))
	}

	if top := results[0].ID; !accepted[top] {
		t.Errorf("SearchTitle(%s)[0] = %v; want any of %v", title, top, accepted)
	}
}

0 comments on commit ba11501

Please sign in to comment.