Add optional support for the authenticated SPN API

The authenticated API has much higher rate limits than the anonymous Save Page Now endpoint.
Val Lorentz 2022-09-25 09:49:01 +02:00
parent 1db60954d6
commit 1eb169ee6c
5 changed files with 241 additions and 34 deletions
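
A minimal sketch of how a worker opts into the authenticated API. The import path and the Db setup are assumptions based on the diff below; the "accesskey:secretkey" format matches the "akey:skey" value used in the tests:

import datetime
import os

from opdb.web_cache import Session  # assumed import path for the module changed below

db = ...  # an opdb Db instance, constructed however the workers already do it

# Passing ias3_auth switches Session to the authenticated SPN API;
# leaving it as None keeps the anonymous flow.
session = Session(
    db,
    min_snapshot_date=datetime.datetime(2022, 9, 1, tzinfo=datetime.timezone.utc),
    ias3_auth=os.environ.get("IAS3_AUTH"),  # "accesskey:secretkey", or unset
)

# get_or_fetch_snapshot now returns an Optional: None means SPN failed
# and the next workflow run should retry.
snapshot = session.get_or_fetch_snapshot("http://example.org/")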


@@ -4,7 +4,7 @@ pipeline:
     image: python:3.9
     commands:
       - pip3 install flake8
-      - make flake8
+      - flake8 opdb/
   pylint:
     group: lint
     image: cytopia/pylint


@@ -10,7 +10,7 @@ black-check:
 black:
 	black opdb/
 
-flake8:
+flake8: black
 	flake8 opdb/
 
 pylint:


@@ -22,6 +22,7 @@ citation, and caches them in the local database for quick access by other workers
 import datetime
 import re
 import socket
+import time
 import typing
 
 import pkg_resources
@@ -57,11 +58,14 @@ class Session:
     archived in the Internet Archive.
     """
 
-    def __init__(self, db: Db, min_snapshot_date: datetime.datetime):
+    def __init__(
+        self, db: Db, min_snapshot_date: datetime.datetime, ias3_auth: str = None
+    ):
         self.min_snapshot_date = min_snapshot_date
         self._db = db
         self._session = requests.Session()
         self._session.headers["User-Agent"] = USER_AGENT
+        self._ias3_auth = ias3_auth
 
     def _fetch_newest_wayback_snapshot(
         self, url: str
@@ -122,12 +126,88 @@ class Session:
             content=response.content,
         )
 
-    def _save_page_now(self, url: str) -> models.WebPageSnapshot:
-        response = self._session.get(
-            f"https://web.archive.org/save/{url}", allow_redirects=False
-        )
+    def _save_page_now(self, url: str) -> typing.Optional[models.WebPageSnapshot]:
+        if self._ias3_auth is None:
+            return self._anonymous_save_page_now(url)
+        else:
+            return self._authenticated_save_page_now(url, self._ias3_auth)
+
+    def _anonymous_save_page_now(
+        self, url: str
+    ) -> typing.Optional[models.WebPageSnapshot]:
+        while True:
+            try:
+                response = self._session.get(
+                    f"https://web.archive.org/save/{url}",
+                    allow_redirects=False,
+                )
+                response.raise_for_status()
+            except requests.exceptions.HTTPError as e:
+                if e.response.status_code == 429:
+                    print(e)
+                    print("Sleeping...")
+                    time.sleep(10)
+                    continue
+                elif e.response.status_code == 520:
+                    # "Job failed". We will try again in the next workflow run.
+                    return None
+                else:
+                    raise
+            else:
+                wayback_url = response.headers["Location"]
+                return self._fetch_wayback_snapshot(url, wayback_url)
+
+    def _authenticated_save_page_now(
+        self, url: str, ias3_auth: str
+    ) -> typing.Optional[models.WebPageSnapshot]:
+        for _ in range(3):
+            response = self._session.post(
+                "https://web.archive.org/save/",
+                allow_redirects=False,
+                data={"url": url},
+                headers={
+                    "Accept": "application/json",
+                    "Authorization": f"LOW {ias3_auth}",
+                },
+            )
+            response.raise_for_status()
+            job_id = response.json()["job_id"]
+
+            status = "pending"
+            while status == "pending":
+                time.sleep(5)
+                response = self._session.get(
+                    f"https://web.archive.org/save/status/{job_id}"
+                )
+                response.raise_for_status()
+                status = response.json()["status"]
+
+            if status == "success":
+                break
+            if response.json()["status_ext"] == "error:service-unavailable":
+                print(response.json()["message"])
+                time.sleep(10)
+                continue  # retry
+            elif response.json()["status_ext"] == "error:too-many-daily-captures":
+                # typically happens when a page repeatedly fails, so we
+                # (unsuccessfully) tried to capture it too many times
+                return None
+            assert False, response.json()
+        else:
+            print("Too many failures; giving up.")
+            return None
+
+        ia_timestamp = response.json()["timestamp"]
+        snapshot_date = _datetime_from_ia_timestamp(ia_timestamp)
+        assert snapshot_date >= self.min_snapshot_date, (
+            snapshot_date.isoformat(),
+            self.min_snapshot_date.isoformat(),
+        )
-        response.raise_for_status()  # TODO: retry
-        wayback_url = response.headers["Location"]
+        wayback_url = f"https://web.archive.org/web/{ia_timestamp}/{url}"
         return self._fetch_wayback_snapshot(url, wayback_url)
 
     def _get_cached_snapshot(self, url: str) -> typing.Optional[models.WebPageSnapshot]:
@@ -137,7 +217,9 @@ class Session:
         return snapshot
 
-    def get_or_fetch_snapshot(self, url: str) -> models.WebPageSnapshot:
+    def get_or_fetch_snapshot(
+        self, url: str
+    ) -> typing.Optional[models.WebPageSnapshot]:
         """
         Fetches the given URL from the local cache or from the Wayback Machine.
@@ -158,5 +240,6 @@ class Session:
         # If the Internet Archive does not have it yet, trigger its Save Page Now,
         # and query the Wayback Machine again
         snapshot = self._save_page_now(url)
-        self._db.add_web_page_snapshots([snapshot])
+        if snapshot is not None:
+            self._db.add_web_page_snapshots([snapshot])
         return snapshot
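
Stripped of the caching and retry machinery above, the SPN2 exchange that _authenticated_save_page_now performs reduces to the following sketch. Endpoints, headers, and JSON fields are those used in the diff; the credentials and URL are placeholders:

import time

import requests

ias3_auth = "accesskey:secretkey"  # placeholder IAS3 key pair
session = requests.Session()

# Submit the capture job.
response = session.post(
    "https://web.archive.org/save/",
    data={"url": "http://example.org/"},
    headers={"Accept": "application/json", "Authorization": f"LOW {ias3_auth}"},
)
response.raise_for_status()
job_id = response.json()["job_id"]

# Poll until the job leaves the "pending" state.
status = "pending"
while status == "pending":
    time.sleep(5)
    response = session.get(f"https://web.archive.org/save/status/{job_id}")
    response.raise_for_status()
    status = response.json()["status"]

if status == "success":
    # The returned timestamp names the snapshot in the Wayback Machine.
    print(f"https://web.archive.org/web/{response.json()['timestamp']}/http://example.org/")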


@@ -38,6 +38,26 @@ def requests_mocker():
 SNAPSHOT_DATE = datetime.datetime(2022, 9, 20, 1, 49, 34, tzinfo=datetime.timezone.utc)
 
 
+@pytest.fixture
+def mock_time(mocker):
+    """
+    Makes time.sleep return immediately, and returns a callable that returns the
+    total offset introduced by time.sleep calls.
+    """
+    offset = 0
+
+    def sleep(seconds):
+        nonlocal offset
+        offset += seconds
+
+    mocker.patch("time.sleep", side_effect=sleep)
+
+    def get_offset():
+        return offset
+
+    return get_offset
+
+
 @pytest.fixture
 def configured_requests_mocker(requests_mocker):
     """Extension of :func:`requests_mocker` that registers a bunch of URLs of
@@ -75,13 +95,20 @@ def configured_requests_mocker(requests_mocker):
         complete_qs=True,
         text="Example page content from Wayback Machine after Save Page Now",
     )
+    requests_mocker.register_uri(
+        "GET",
+        "https://web.archive.org/web/20220922000000id_/http://example.org/",
+        complete_qs=True,
+        text="Example page content from Wayback Machine after Save Page Now",
+    )
 
+    # Anonymous SPN:
     requests_mocker.register_uri(
         "GET",
         "https://web.archive.org/save/http://example.org/",
         complete_qs=True,
         headers={
-            "location": "https://web.archive.org/web/20220920164222/http://example.org/"
+            "location": "https://web.archive.org/web/20220922000000/http://example.org/"
         },
         text="""
 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
@@ -90,6 +117,37 @@ def configured_requests_mocker(requests_mocker):
 """,
     )
 
+    # Authenticated SPN:
+    requests_mocker.register_uri(
+        "POST",
+        "https://web.archive.org/save/",
+        complete_qs=True,
+        text='{"url": "http://example.org/", "job_id": "spn2-abcde"}',
+    )
+    requests_mocker.register_uri(
+        "GET",
+        "https://web.archive.org/save/status/spn2-abcde",
+        [
+            dict(
+                text="""
+                {"job_id":"spn2-abcde","resources":["blah"],"status":"pending"}
+                """
+            ),
+            dict(
+                text="""
+                {"job_id":"spn2-abcde","resources":["blah"],"status":"pending"}
+                """
+            ),
+            dict(
+                text="""
+                {"job_id":"spn2-abcde","resources":["blah"],"status":"success",
+                "timestamp":"20220922000000"}
+                """
+            ),
+        ],
+        complete_qs=True,
+    )
+
     yield requests_mocker
@@ -131,6 +189,7 @@ def test_get__uncached__recent_wb(configured_requests_mocker, opdb_db: Db):
     snapshot = s.get_or_fetch_snapshot("http://example.org/")
     dt_after = datetime.datetime.now(tz=datetime.timezone.utc)
 
+    assert snapshot is not None
     assert dt_before <= snapshot.retrieved_at <= dt_after
     assert snapshot == models.WebPageSnapshot(
@@ -187,27 +246,51 @@ def test_get__expired_cache__recent_wb(configured_requests_mocker, opdb_db: Db):
     test_get__uncached__recent_wb(configured_requests_mocker, opdb_db)
 
 
-def test_get__uncached__expired_wb(configured_requests_mocker, opdb_db: Db):
+@pytest.fixture(
+    params=[
+        pytest.param(False, id="anonymous"),
+        pytest.param(True, id="authenticated"),
+    ],
+)
+def authenticated(request) -> bool:
+    """Parametrized by False/True"""
+    return request.param
+
+
+def test_get__uncached__expired_wb(
+    configured_requests_mocker, opdb_db: Db, authenticated: bool, mock_time
+):
     """
     Tests getting a snapshot that is not in the local cache, and expired in
     the Wayback Machine -> uses Save Page Now
     """
     after_date = SNAPSHOT_DATE + datetime.timedelta(days=1)
-    s = Session(opdb_db, after_date)
+    if authenticated:
+        s = Session(opdb_db, after_date, ias3_auth="akey:skey")
+    else:
+        s = Session(opdb_db, after_date)
+
+    assert mock_time() == 0
     dt_before = datetime.datetime.now(tz=datetime.timezone.utc)
     snapshot = s.get_or_fetch_snapshot("http://example.org/")
     dt_after = datetime.datetime.now(tz=datetime.timezone.utc)
+    if authenticated:
+        assert mock_time() == 15  # three time.sleep(5) calls
+    else:
+        assert mock_time() == 0  # no sleep() because blocking on SPN GET request
 
+    assert snapshot is not None
     assert dt_before <= snapshot.retrieved_at <= dt_after
     assert snapshot == models.WebPageSnapshot(
         url="http://example.org/",
         snapshot_date=datetime.datetime(
-            2022, 9, 20, 16, 42, 22, tzinfo=datetime.timezone.utc
+            2022, 9, 22, 0, 0, 0, tzinfo=datetime.timezone.utc
         ),
         snapshot_url=(
-            "https://web.archive.org/web/20220920164222id_/"  # SPN returns HTTPS URLs
+            "https://web.archive.org/web/20220922000000id_/"  # SPN returns HTTPS URLs
             "http://example.org/"
         ),
         retrieved_at=snapshot.retrieved_at,
@@ -218,24 +301,54 @@ def test_get__uncached__expired_wb(configured_requests_mocker, opdb_db: Db):
     assert snapshot == opdb_db.get_last_web_page_snapshot("http://example.org/")
 
-    assert [(r.method, r.url) for r in configured_requests_mocker.request_history] == [
-        (
-            "GET",
-            "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
-        ),
-        (
-            "GET",
-            "https://web.archive.org/save/http://example.org/",
-        ),
-        (
-            "GET",
-            "https://web.archive.org/web/20220920164222id_/"  # ditto
-            "http://example.org/",
-        ),
-    ]
+    if authenticated:
+        assert [
+            (r.method, r.url, r.body)
+            for r in configured_requests_mocker.request_history
+        ] == [
+            (
+                "GET",
+                "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
+                None,
+            ),
+            (
+                "POST",
+                "https://web.archive.org/save/",
+                "url=http%3A%2F%2Fexample.org%2F",
+            ),
+            ("GET", "https://web.archive.org/save/status/spn2-abcde", None),
+            ("GET", "https://web.archive.org/save/status/spn2-abcde", None),
+            ("GET", "https://web.archive.org/save/status/spn2-abcde", None),
+            (
+                "GET",
+                "https://web.archive.org/web/20220922000000id_/"  # ditto
+                "http://example.org/",
+                None,
+            ),
+        ]
+    else:
+        assert [
+            (r.method, r.url) for r in configured_requests_mocker.request_history
+        ] == [
+            (
+                "GET",
+                "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
+            ),
+            (
+                "GET",
+                "https://web.archive.org/save/http://example.org/",
+            ),
+            (
+                "GET",
+                "https://web.archive.org/web/20220922000000id_/"  # ditto
+                "http://example.org/",
+            ),
+        ]
 
 
-def test_get__expired_cache__expired_wb(configured_requests_mocker, opdb_db: Db):
+def test_get__expired_cache__expired_wb(
+    configured_requests_mocker, opdb_db: Db, authenticated: bool, mock_time
+):
     """
     Tests getting a snapshot that is expired in the local cache, and expired in
     the Wayback Machine -> uses Save Page Now
@@ -256,10 +369,14 @@ def test_get__expired_cache__expired_wb(configured_requests_mocker, opdb_db: Db):
     opdb_db.add_web_page_snapshots([snapshot])
 
     # Reuse the other test; web_cache.Session should simply ignore the outdated snapshot
-    test_get__uncached__expired_wb(configured_requests_mocker, opdb_db)
+    test_get__uncached__expired_wb(
+        configured_requests_mocker, opdb_db, authenticated, mock_time
+    )
 
 
-def test_get__expired_cache__no_wb(configured_requests_mocker, opdb_db: Db):
+def test_get__expired_cache__no_wb(
+    configured_requests_mocker, opdb_db: Db, authenticated: bool, mock_time
+):
     """
     Tests getting a snapshot that is expired in the local cache, and absent from
     the Wayback Machine -> uses Save Page Now
@@ -273,4 +390,6 @@ def test_get__expired_cache__no_wb(configured_requests_mocker, opdb_db: Db):
     # Reuse the other test; web_cache.Session should treat the absence of a page
     # exactly the same way as an expired one.
-    test_get__uncached__expired_wb(configured_requests_mocker, opdb_db)
+    test_get__uncached__expired_wb(
+        configured_requests_mocker, opdb_db, authenticated, mock_time
+    )


@@ -15,6 +15,7 @@ dependencies = [
 [project.optional-dependencies]
 testing = [
     "pytest",
+    "pytest-mock",
    "pytest-postgresql",
     "requests-mock",
     "types-requests",
@@ -41,8 +42,12 @@ disable = [
     "fixme",
     "invalid-name",
     "no-else-return",
+    "no-else-continue",
     "too-few-public-methods",
     "too-many-instance-attributes",
+    # false positives:
+    "unreachable",
+    "assignment-from-no-return",
     # mypy does it better:
     "no-member",
     "import-error",