diff --git a/compose.production.yaml b/compose.production.yaml index 3056bdca5ca..9df7c18585e 100644 --- a/compose.production.yaml +++ b/compose.production.yaml @@ -73,6 +73,8 @@ services: - ../olsystem:/olsystem - /1:/1 deploy: + # Note: the replicas here must be kept in sync with the `upstream covers_backend` + # value in `docker/covers_nginx.conf`. replicas: 2 covers_nginx: diff --git a/docker/covers_nginx.conf b/docker/covers_nginx.conf index 0d6ec59e2cc..33f6c943933 100644 --- a/docker/covers_nginx.conf +++ b/docker/covers_nginx.conf @@ -16,6 +16,15 @@ server { ssl_prefer_server_ciphers on; } +# Docker's internal load balancing ends up with unbalanced connections eventually. +# This must be kept in sync with the `replicas` value in `compose.production.yaml` +# for the `covers` service. +upstream covers_backend { + least_conn; + server openlibrary-covers-1:7075; + server openlibrary-covers-2:7075; +} + server { listen 80; listen 443; @@ -25,8 +34,15 @@ server { keepalive_timeout 5; + # Return 429 errors as JSON. + error_page 429 = @429; + location @429 { + default_type application/json; + return 429 '{"status": 429, "message": "Too Many Requests. Please email us at info@archive.org"}'; + } + location / { - proxy_pass http://covers:7075; + proxy_pass http://covers_backend; proxy_set_header Host $http_host; # Gunicorn takes IP from this header @@ -37,8 +53,17 @@ server { proxy_set_header X-Scheme $scheme; if ($http_user_agent ~ (Bytespider) ) { - return 429; + return 444; } + + if ($http_user_agent ~ (CloudFront) ) { + return 444; + } + + + # Covers rate limit. 
+ limit_req zone=cover_limit burst=400 nodelay; + limit_req_status 429; } location ^~ /.well-known/acme-challenge/ { diff --git a/docker/nginx.conf b/docker/nginx.conf index 406278a561a..7a4e8d13af5 100644 --- a/docker/nginx.conf +++ b/docker/nginx.conf @@ -11,7 +11,7 @@ error_log /var/log/nginx/error.log; pid /var/run/nginx.pid; events { - worker_connections 1024; + worker_connections 2048; # multi_accept on; } @@ -44,6 +44,25 @@ http { # Black-listed IPs include /olsystem/etc/nginx/deny.conf; + # Rate limiting: https://nginx.org/en/docs/http/ngx_http_limit_req_module.html + # No rate limit when IP obfuscation is not applied, as every IP is 255.0.0.0. + # These rules only do anything if invoked, e.g., in web_nginx.conf. + # TLDR: these rules can be disabled in `docker/web_nginx.conf` + # and `docker/covers_nginx.conf`. + geo $should_apply_limit { + 255.0.0.0 0; + default 1; + } + + map $should_apply_limit $rate_limit_key { + 0 ''; + 1 $binary_remote_addr; + } + + limit_req_zone $rate_limit_key zone=web_limit:10m rate=200r/m; + # Set a more permissive limit for covers because some pages might load 20+ covers. + limit_req_zone $rate_limit_key zone=cover_limit:10m rate=400r/m; + # Things are mounted into here by the docker compose file include /etc/nginx/sites-enabled/*; } diff --git a/docker/web_nginx.conf b/docker/web_nginx.conf index 960670eb739..654ee5168a5 100644 --- a/docker/web_nginx.conf +++ b/docker/web_nginx.conf @@ -64,6 +64,14 @@ server { if ($api_call = "http:noapi") { rewrite ^(.*)$ https://$http_host$1 last; } + + # Return 429 errors as JSON. + error_page 429 = @429; + location @429 { + default_type application/json; + return 429 '{"status": 429, "message": "Too Many Requests. 
Consider using https://openlibrary.org/developers/dumps."}'; + } + location / { proxy_pass http://webnodes; proxy_set_header Host $http_host; @@ -76,8 +84,13 @@ server { proxy_set_header X-Scheme $scheme; if ($http_user_agent ~ (Bytespider) ) { - return 429; + return 444; } + + + # Web rate limit. + limit_req zone=web_limit burst=200 nodelay; + limit_req_status 429; } location ^~ /.well-known/acme-challenge/ { diff --git a/openlibrary/components/BulkSearch.vue b/openlibrary/components/BulkSearch.vue index d972f62c915..f378f49c05d 100644 --- a/openlibrary/components/BulkSearch.vue +++ b/openlibrary/components/BulkSearch.vue @@ -1,5 +1,5 @@ @@ -39,7 +34,40 @@ export default { diff --git a/openlibrary/components/BulkSearch/components/TableRow.vue b/openlibrary/components/BulkSearch/components/TableRow.vue index 53239cfed73..1105b1cbcfe 100644 --- a/openlibrary/components/BulkSearch/components/TableRow.vue +++ b/openlibrary/components/BulkSearch/components/TableRow.vue @@ -41,11 +41,8 @@ th { padding: 4px; } .bookCards { - font-family: Roboto, sans-serif; display: flex; flex-direction: row; align-items: center; - - } diff --git a/openlibrary/components/BulkSearch/utils/classes.js b/openlibrary/components/BulkSearch/utils/classes.js index de47ed768db..756c9194f10 100644 --- a/openlibrary/components/BulkSearch/utils/classes.js +++ b/openlibrary/components/BulkSearch/utils/classes.js @@ -214,11 +214,11 @@ export class BulkSearchState{ this.extractionOptions = new ExtractionOptions(); /** @type {AbstractExtractor[]} */ this.extractors = [ - new RegexExtractor('e.g. "The Wizard of Oz by L. Frank Baum"', '(^|>)(?<title>[A-Za-z][\\p{L}0-9\\- ,]{1,250})\\s+(by|[-\u2013\u2014\\t])\\s+(?<author>[\\p{L}][\\p{L}\\.\\- ]{3,70})( \\(.*)?($|<\\/)'), - new RegexExtractor('e.g. "L. Frank Baum - The Wizard of Oz"', '(^|>)(?<author>[A-Za-z][\\p{L}0-9\\- ,]{1,250})\\s+[,-\u2013\u2014\\t]\\s+(?<title>[\\p{L}][\\p{L}\\.\\- ]{3,70})( \\(.*)?($|<\\/)'), - new RegexExtractor('e.g. 
"The Wizard of Oz - L. Frank Baum"', '(^|>)(?<title>[A-Za-z][\\p{L}0-9\\- ,]{1,250})\\s+[,-\u2013\u2014\\t]\\s+(?<author>[\\p{L}][\\p{L}\\.\\- ]{3,70})( \\(.*)?($|<\\/)'), - new RegexExtractor('e.g. "The Wizard of Oz (L. Frank Baum)"', '^(?<title>[\\p{L}].{1,250})\\s\\(?<author>(.{3,70})\\)$$'), - new RegexExtractor('Wikipedia Citation (e.g. Baum, Frank L. (1994). The Wizard of Oz)', '^(?<author>[^.()]+).*?\\)\\. (?<title>[^.]+)'), + new RegexExtractor('Pattern: Title by Author', '(^|>)(?<title>[A-Za-z][\\p{L}0-9\\- ,]{1,250})\\s+(by|[-\u2013\u2014\\t])\\s+(?<author>[\\p{L}][\\p{L}\\.\\- ]{3,70})( \\(.*)?($|<\\/)'), + new RegexExtractor('Pattern: Author - Title', '(^|>)(?<author>[A-Za-z][\\p{L}0-9\\- ,]{1,250})\\s+[,\\-\u2013\u2014\\t]\\s+(?<title>[\\p{L}][\\p{L}\\.\\- ]{3,70})( \\(.*)?($|<\\/)'), + new RegexExtractor('Pattern: Title - Author', '(^|>)(?<title>[A-Za-z][\\p{L}0-9\\- ,]{1,250})\\s+[,\\-\u2013\u2014\\t]\\s+(?<author>[\\p{L}][\\p{L}\\.\\- ]{3,70})( \\(.*)?($|<\\/)'), + new RegexExtractor('Pattern: Title (Author)', '^(?<title>[\\p{L}].{1,250})\\s\\((?<author>.{3,70})\\)$'), + new RegexExtractor('Wikipedia Citation Pattern: (e.g. Baum, Frank L. (1994). The Wizard of Oz)', '^(?<author>[^.()]+).*?\\)\\. 
(?<title>[^.]+)'), new AiExtractor('✨ AI Extraction', 'gpt-4o-mini'), new TableExtractor('Extract from a Table/Spreadsheet') ] diff --git a/openlibrary/components/BulkSearch/utils/samples.js b/openlibrary/components/BulkSearch/utils/samples.js index 607cee41e25..e9c08adddf5 100644 --- a/openlibrary/components/BulkSearch/utils/samples.js +++ b/openlibrary/components/BulkSearch/utils/samples.js @@ -1,6 +1,6 @@ export const sampleData = [ { - name: 'Choose sample...', + name: 'Try a Sample...', source: '', text: '', }, diff --git a/openlibrary/core/lists/model.py b/openlibrary/core/lists/model.py index f75ecb68f6b..5a82a3a6577 100644 --- a/openlibrary/core/lists/model.py +++ b/openlibrary/core/lists/model.py @@ -14,7 +14,7 @@ from openlibrary.core import helpers as h from openlibrary.core import cache -from openlibrary.core.models import Image, Subject, Thing, ThingKey +from openlibrary.core.models import Image, Subject, Thing, ThingKey, ThingReferenceDict from openlibrary.plugins.upstream.models import Author, Changeset, Edition, User, Work from openlibrary.plugins.worksearch.search import get_solr @@ -24,10 +24,6 @@ logger = logging.getLogger("openlibrary.lists.model") -class ThingReferenceDict(TypedDict): - key: ThingKey - - SeedSubjectString = str """ When a subject is added to a list, it's added as a string like: diff --git a/openlibrary/core/models.py b/openlibrary/core/models.py index beb6a3f5991..4266a026627 100644 --- a/openlibrary/core/models.py +++ b/openlibrary/core/models.py @@ -8,7 +8,7 @@ import web import json import requests -from typing import Any +from typing import Any, TypedDict from collections import defaultdict from dataclasses import dataclass, field @@ -219,6 +219,10 @@ def _get_d(self): } +class ThingReferenceDict(TypedDict): + key: ThingKey + + class Edition(Thing): """Class to represent /type/edition objects in OL.""" diff --git a/openlibrary/macros/TableOfContents.html b/openlibrary/macros/TableOfContents.html index 
5ca39eefce6..6a83a1fd25f 100644 --- a/openlibrary/macros/TableOfContents.html +++ b/openlibrary/macros/TableOfContents.html @@ -1,31 +1,38 @@ -$def with (table_of_contents, ocaid=None, highlighting=False, cls='', attrs='') +$def with (table_of_contents, ocaid=None, cls='', attrs='') $ min_level = min(chapter.level for chapter in table_of_contents) <div class="toc $cls" $:attrs> $for chapter in table_of_contents: - $ is_link = ocaid and chapter.pagenum and chapter.pagenum.isdigit() - $ tag = 'a' if is_link else 'div' - <$tag + <div class="toc__entry" - $:cond(is_link, 'href="//archive.org/details/%s/page/%s"' % (ocaid, chapter.pagenum)) - $:cond(is_link, 'data-ol-link-track="BookPage|TOCClick"') data-level="$chapter.level" style="margin-left:$((chapter.level - min_level) * 2)ch" > - <span class="toc__name"> - $ label = chapter.label - $if label and not label.endswith('.'): - $ label = label.strip() + '. ' - $if highlighting: - $# This isn't html injection, because solr returns everything already html escaped except for the em of the highlight - $:label - $:chapter.title - $else: - $label - $chapter.title - </span> - $if chapter.pagenum: - <span class="toc__dots" style="flex:1; border-bottom: 1px dotted;"></span> - <span class="toc__pagenum">$_('Page %s', chapter.pagenum)</span> - </$tag> + $ is_link = ocaid and chapter.pagenum and chapter.pagenum.isdigit() + $ tag = 'a' if is_link else 'div' + <$tag + class="toc__main" + $:cond(is_link, 'href="//archive.org/details/%s/page/%s"' % (ocaid, chapter.pagenum)) + $:cond(is_link, 'data-ol-link-track="BookPage|TOCClick"') + > + <div class="toc__name"> + $ label = chapter.label + $if label and not label.endswith('.'): + $ label = label.strip() + '. 
' + + <div class="toc__title">$label $chapter.title</div> + + $if chapter.subtitle: + <div class="toc__subtitle">$chapter.subtitle</div> + $if chapter.authors: + <div class="toc__authors">$:macros.BookByline(chapter.authors)</div> + </div> + $if chapter.pagenum: + <span class="toc__dots"></span> + <span class="toc__pagenum">$_('Page %s', chapter.pagenum)</span> + </$tag> + + $if chapter.description: + <div class="toc__description">$chapter.description</div> + </div> </div> diff --git a/openlibrary/plugins/upstream/models.py b/openlibrary/plugins/upstream/models.py index 5c10f96b809..617fac7c5ac 100644 --- a/openlibrary/plugins/upstream/models.py +++ b/openlibrary/plugins/upstream/models.py @@ -17,6 +17,7 @@ from openlibrary.core.models import Image from openlibrary.core import lending +from openlibrary.plugins.upstream.table_of_contents import TocEntry from openlibrary.plugins.upstream.utils import MultiDict, parse_toc, get_edition_config from openlibrary.plugins.upstream import account from openlibrary.plugins.upstream import borrow @@ -414,24 +415,18 @@ def format_row(r): return "\n".join(format_row(r) for r in self.get_table_of_contents()) - def get_table_of_contents(self): + def get_table_of_contents(self) -> list[TocEntry]: def row(r): if isinstance(r, str): - level = 0 - label = "" - title = r - pagenum = "" + return TocEntry(level=0, title=r) else: - level = safeint(r.get('level', '0'), 0) - label = r.get('label', '') - title = r.get('title', '') - pagenum = r.get('pagenum', '') + return TocEntry.from_dict(r) - r = web.storage(level=level, label=label, title=title, pagenum=pagenum) - return r - - d = [row(r) for r in self.table_of_contents] - return [row for row in d if any(row.values())] + return [ + toc_entry + for r in self.table_of_contents + if not (toc_entry := row(r)).is_empty() + ] def set_toc_text(self, text): self.table_of_contents = parse_toc(text) diff --git a/openlibrary/plugins/upstream/table_of_contents.py 
b/openlibrary/plugins/upstream/table_of_contents.py new file mode 100644 index 00000000000..c648cd7ebc5 --- /dev/null +++ b/openlibrary/plugins/upstream/table_of_contents.py @@ -0,0 +1,40 @@ +from dataclasses import dataclass +from typing import TypedDict + +from openlibrary.core.models import ThingReferenceDict + + +class AuthorRecord(TypedDict): + name: str + author: ThingReferenceDict | None + + +@dataclass +class TocEntry: + level: int + label: str | None = None + title: str | None = None + pagenum: str | None = None + + authors: list[AuthorRecord] | None = None + subtitle: str | None = None + description: str | None = None + + @staticmethod + def from_dict(d: dict) -> 'TocEntry': + return TocEntry( + level=d.get('level', 0), + label=d.get('label'), + title=d.get('title'), + pagenum=d.get('pagenum'), + authors=d.get('authors'), + subtitle=d.get('subtitle'), + description=d.get('description'), + ) + + def is_empty(self) -> bool: + return all( + not getattr(self, field) + for field in self.__annotations__ + if field != 'level' + ) diff --git a/static/css/components/toc.less b/static/css/components/toc.less index 5b19e4ed374..282e749ef75 100644 --- a/static/css/components/toc.less +++ b/static/css/components/toc.less @@ -2,28 +2,67 @@ @import (reference) "../less/colors.less"; .toc__entry { + line-height: 1.2em; + border-radius: 4px; padding: 3px 8px; @media only screen and (hover: none) { - padding: 6px 8px; /* Increase padding for touch-only devices */ + padding: 8px; /* Increase padding for touch-only devices */ + } + + .toc__main { + display: flex; + gap: 4px; + position: relative; + } + + .toc__dots { + flex: 1; + border-bottom: 1px dotted; + height: 1.2em; + } + + .toc__subtitle { + font-style: oblique; + color: @accessible-grey; } - display: flex; - gap: 4px; - line-height: 1.2em; - transition: background-color .2s; - em { - font-weight: bold; + .toc__subtitle, .toc__authors, .toc__description { + font-size: @font-size-label-large; + text-decoration: 
none; + } + + .toc__description { + margin-top: 5px; } } -a.toc__entry { +a.toc__main { text-decoration: none; - .toc__name { + border-radius: 4px; + + .toc__title { text-decoration: underline; } - &:hover { - background: rgba(0, 124, 255, .2); + + &:hover:after { + display: block; + content: ""; + inset: -2px; + position: absolute; + border-radius: 4px; + pointer-events: none; + animation: fade-in .2s; + background-color: rgba(0, 124, 255, .15); + } +} + +@keyframes fade-in { + from { + background-color: rgba(0, 124, 255, 0); + } + to { + background-color: rgba(0, 124, 255, .15); } } @@ -33,6 +72,10 @@ a.toc__entry { @media (max-width: @width-breakpoint-mobile) { .toc__entry { + padding: 8px 0; + } + + .toc__main { flex-direction: column; gap: 0; } @@ -42,7 +85,7 @@ a.toc__entry { } .toc__pagenum { - font-size: .9em; + font-size: @font-size-label-large; color: @accessible-grey; } }