web_cache: Initialize module

With support for fetching from the IA and caching in the local DB.
Val Lorentz 2022-09-20 21:33:55 +02:00
parent 5f3e9d6225
commit 6f022e5f05
4 changed files with 448 additions and 7 deletions

Makefile

@@ -26,6 +26,6 @@ mypy:
	mypy --show-error-codes opdb
pytest:
	pytest
	pytest --doctest-modules
.PHONY: black black-check isort isort-check mypy pytest test

opdb/web_cache.py Normal file

@@ -0,0 +1,158 @@
# This file is part of the Open Parts Database software
# Copyright (C) 2022 Valentin Lorentz
#
# This program is free software: you can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License version 3, as published by the
# Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
"""
Management of the cache of external pages.
This package fetches web pages, archives them in the Internet Archive for future
citation, and caches them in the local database for quick access by other workers.
"""
import datetime
import re
import socket
import typing
import pkg_resources
import requests
from opdb.db import Db, models
_OPDB_VERSION = pkg_resources.require("opdb")[0].version
USER_AGENT = (
f"OPDB/{_OPDB_VERSION} (Open Parts Database cacher; +https://git.tf/opdb/opdb)"
)
_wayback_url_re = re.compile(
r"^https?://web\.archive\.org/web/(?P<timestamp>[0-9]{14})/(?P<origin_url>.+)$"
)
def _datetime_from_ia_timestamp(ia_timestamp: str) -> datetime.datetime:
"""
>>> _datetime_from_ia_timestamp("20220919233014")
datetime.datetime(2022, 9, 19, 23, 30, 14, tzinfo=datetime.timezone.utc)
"""
dt = datetime.datetime.strptime(ia_timestamp, "%Y%m%d%H%M%S")
# Assume it's UTC (neither the Wayback API nor the documentation mention
# timezones)
return dt.replace(tzinfo=datetime.timezone.utc)
class Session:
"""
Wrapper for :class:`requests.Session`, which tries to use pages cached locally in
the postgresql database, and falls back to downloading; making sure they are
archived in the Internet Archive.
"""
def __init__(self, db: Db, min_snapshot_date: datetime.datetime):
self.min_snapshot_date = min_snapshot_date
self._db = db
self._session = requests.Session()
self._session.headers["User-Agent"] = USER_AGENT
def _fetch_newest_wayback_snapshot(
self, url: str
) -> typing.Optional[models.WebPageSnapshot]:
"""
If the URL is already archived in the Internet Archive (and newer than
configured with ``min_snapshot_date``), retrieves the latest snapshot available
via the Wayback Machine and returns it.
"""
# API documentation: https://archive.org/help/wayback_api.php
response = self._session.get(
"https://archive.org/wayback/available", params={"url": url}
)
response.raise_for_status() # TODO: retry
newest_ia_snapshot = (
response.json().get("archived_snapshots", {}).get("closest", {})
)
if not newest_ia_snapshot:
return None
ia_timestamp = newest_ia_snapshot["timestamp"]
snapshot_date = _datetime_from_ia_timestamp(ia_timestamp)
if snapshot_date < self.min_snapshot_date:
return None
wayback_url = newest_ia_snapshot["url"]
m = _wayback_url_re.match(wayback_url)
assert m, f"Unexpected Wayback URL format: {wayback_url}"
assert ia_timestamp == m.group(
"timestamp"
), "Timestamp unexpectedly missing from snapshot URL: {wayback_url}"
return self._fetch_wayback_snapshot(url, wayback_url)
def _fetch_wayback_snapshot(
self, url: str, wayback_url: str
) -> models.WebPageSnapshot:
# Add "id_" after the timestamp in the Wayback URL; it allows fetching the
# original page without the navigation header added by the Wayback Machine.
# Documented at https://archive.org/post/1044859/
m = _wayback_url_re.match(wayback_url)
assert m, f"Unexpected Wayback URL format: {wayback_url}"
ia_timestamp = m.group("timestamp")
snapshot_url = wayback_url.replace(ia_timestamp, ia_timestamp + "id_", 1)
response = self._session.get(snapshot_url)
response.raise_for_status() # TODO: retry
return models.WebPageSnapshot(
url=url,
snapshot_date=_datetime_from_ia_timestamp(ia_timestamp),
snapshot_url=snapshot_url,
retrieved_at=datetime.datetime.now(tz=datetime.timezone.utc),
retrieved_by=socket.getfqdn(),
response_headers=dict(response.headers),
content=response.content,
)
def _save_page_now(self, url: str) -> models.WebPageSnapshot:
response = self._session.get(f"https://web.archive.org/save/{url}")
response.raise_for_status() # TODO: retry
wayback_url = response.headers["Location"]
return self._fetch_wayback_snapshot(url, wayback_url)
def _get_cached_snapshot(self, url: str) -> typing.Optional[models.WebPageSnapshot]:
snapshot = self._db.get_last_web_page_snapshot(url)
if snapshot is None or snapshot.snapshot_date < self.min_snapshot_date:
return None
return snapshot
def get_or_fetch_snapshot(self, url: str) -> models.WebPageSnapshot:
"""
Fetches the given URL from the local cache or from the Wayback Machine.
Requests archival by the Internet Archive if the Wayback Machine does not
already have the page available.
"""
# First, try the local cache
snapshot = self._get_cached_snapshot(url)
if snapshot is not None:
return snapshot
# Then, try fetching from the Wayback Machine (and cache it locally)
snapshot = self._fetch_newest_wayback_snapshot(url)
if snapshot is not None:
self._db.add_web_page_snapshots([snapshot])
return snapshot
# If the Internet Archive does not have it yet, trigger its Save Code Now,
# and query the Wayback Machine again
return self._save_page_now(url)
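
For orientation, a minimal usage sketch of the new module. The `Db` constructor argument here is hypothetical; only `Session`, `get_or_fetch_snapshot`, and the `WebPageSnapshot` fields are taken from the code above:

import datetime

from opdb.db import Db
from opdb.web_cache import Session

# Hypothetical connection setup; the real Db constructor may take different
# arguments.
db = Db("postgresql:///opdb")

# Accept snapshots up to 30 days old; anything older is refreshed from the
# Wayback Machine, or re-archived via Save Page Now if the Internet Archive
# has nothing recent enough.
min_snapshot_date = datetime.datetime.now(
    tz=datetime.timezone.utc
) - datetime.timedelta(days=30)

session = Session(db, min_snapshot_date)
snapshot = session.get_or_fetch_snapshot("http://example.org/")
print(snapshot.snapshot_url, snapshot.snapshot_date, len(snapshot.content))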

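As an aside, the three `# TODO: retry` comments could be addressed with urllib3's `Retry` mounted on the underlying `requests` session. A rough sketch, not part of this commit, using urllib3 ≥ 1.26 parameter names and illustrative values only:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def _make_retrying_session() -> requests.Session:
    # Retry GETs on connection errors, 429 rate limiting, and 5xx responses,
    # with exponential backoff between attempts.
    retry = Retry(
        total=5,
        backoff_factor=0.5,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],
    )
    session = requests.Session()
    session.mount("http://", HTTPAdapter(max_retries=retry))
    session.mount("https://", HTTPAdapter(max_retries=retry))
    return session
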
opdb/web_cache_test.py Normal file

@@ -0,0 +1,272 @@
# This file is part of the Open Parts Database software
# Copyright (C) 2022 Valentin Lorentz
#
# This program is free software: you can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License version 3, as published by the
# Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
# pylint: disable=redefined-outer-name
"""
Test generic web page retrieval and caching
"""
import datetime
import socket

import pytest
import requests_mock

from opdb.db import Db, models
from opdb.web_cache import Session


@pytest.fixture
def requests_mocker():
    """Fixture wrapper for :mod:`requests_mock`"""
    with requests_mock.mock() as m:
        yield m


SNAPSHOT_DATE = datetime.datetime(2022, 9, 20, 1, 49, 34, tzinfo=datetime.timezone.utc)


@pytest.fixture
def configured_requests_mocker(requests_mocker):
    """Extension of :func:`requests_mocker` that registers a bunch of URLs of
    the Wayback Machine API."""
    requests_mocker.register_uri(
        "GET",
        "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
        complete_qs=True,
        json={
            "url": "http://example.org/",
            "archived_snapshots": {
                "closest": {
                    "status": "200",
                    "available": True,
                    "url": (
                        "http://web.archive.org/web/20220920014934/"
                        "http://john.smith@example.org/"
                    ),
                    "timestamp": "20220920014934",
                }
            },
        },
    )
    requests_mocker.register_uri(
        "GET",
        "http://web.archive.org/web/20220920014934id_/http://john.smith@example.org/",
        complete_qs=True,
        text="Example page content from Wayback Machine",
    )
    requests_mocker.register_uri(
        "GET",
        "https://web.archive.org/web/20220920164222id_/http://example.org/",
        complete_qs=True,
        text="Example page content from Wayback Machine after Save Page Now",
    )
    requests_mocker.register_uri(
        "GET",
        "https://web.archive.org/save/http://example.org/",
        complete_qs=True,
        headers={
            "location": "https://web.archive.org/web/20220920164222/http://example.org/"
        },
        text="""
            <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
            <title>Redirecting...</title>
            <h1>Redirecting...</h1>
        """,
    )
    yield requests_mocker


def test_get__cached(configured_requests_mocker, opdb_db: Db):
    """
    Tests getting a snapshot that is already in the local cache
    """
    after_date = SNAPSHOT_DATE - datetime.timedelta(days=1)
    s = Session(opdb_db, after_date)
    retrieved_at = datetime.datetime(2022, 2, 1, 0, 0, 0, tzinfo=datetime.timezone.utc)
    snapshot = models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=datetime.datetime.now(tz=datetime.timezone.utc),
        snapshot_url=None,
        retrieved_at=retrieved_at,
        retrieved_by="localhost",
        response_headers={"Content-Length": "20"},
        content=b"Example page content",
    )
    opdb_db.add_web_page_snapshots([snapshot])

    assert s.get_or_fetch_snapshot("http://example.org/") == snapshot
    assert [(r.method, r.url) for r in configured_requests_mocker.request_history] == []


def test_get__uncached__recent_wb(configured_requests_mocker, opdb_db: Db):
    """
    Tests getting a snapshot that is not in the local cache, but is available in
    the Wayback Machine
    """
    after_date = SNAPSHOT_DATE - datetime.timedelta(days=1)
    s = Session(opdb_db, after_date)

    dt_before = datetime.datetime.now(tz=datetime.timezone.utc)
    snapshot = s.get_or_fetch_snapshot("http://example.org/")
    dt_after = datetime.datetime.now(tz=datetime.timezone.utc)

    assert dt_before <= snapshot.retrieved_at <= dt_after
    assert snapshot == models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=datetime.datetime(
            2022, 9, 20, 1, 49, 34, tzinfo=datetime.timezone.utc
        ),
        snapshot_url=(
            "http://web.archive.org/web/20220920014934id_/"
            "http://john.smith@example.org/"
        ),
        retrieved_at=snapshot.retrieved_at,
        retrieved_by=socket.getfqdn(),
        response_headers={},
        content=b"Example page content from Wayback Machine",
    )

    assert [(r.method, r.url) for r in configured_requests_mocker.request_history] == [
        (
            "GET",
            "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
        ),
        (
            "GET",
            "http://web.archive.org/web/20220920014934id_/"
            "http://john.smith@example.org/",
        ),
    ]


def test_get__expired_cache__recent_wb(configured_requests_mocker, opdb_db: Db):
    """
    Tests getting a snapshot that is expired in the local cache, but is available in
    the Wayback Machine
    """
    snapshotted_at = SNAPSHOT_DATE - datetime.timedelta(days=2)
    retrieved_at = SNAPSHOT_DATE + datetime.timedelta(days=1)
    snapshot = models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=snapshotted_at,
        snapshot_url=None,
        retrieved_at=retrieved_at,
        retrieved_by="localhost",
        response_headers={"Content-Length": "16"},
        content=b"Old page content",
    )
    opdb_db.add_web_page_snapshots([snapshot])

    # Reuse the other test; web_cache.Session should simply ignore the outdated
    # snapshot
    test_get__uncached__recent_wb(configured_requests_mocker, opdb_db)


def test_get__uncached__expired_wb(configured_requests_mocker, opdb_db: Db):
    """
    Tests getting a snapshot that is not in the local cache, and expired in
    the Wayback Machine -> uses Save Page Now
    """
    after_date = SNAPSHOT_DATE + datetime.timedelta(days=1)
    s = Session(opdb_db, after_date)

    dt_before = datetime.datetime.now(tz=datetime.timezone.utc)
    snapshot = s.get_or_fetch_snapshot("http://example.org/")
    dt_after = datetime.datetime.now(tz=datetime.timezone.utc)

    assert dt_before <= snapshot.retrieved_at <= dt_after
    assert snapshot == models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=datetime.datetime(
            2022, 9, 20, 16, 42, 22, tzinfo=datetime.timezone.utc
        ),
        snapshot_url=(
            "https://web.archive.org/web/20220920164222id_/"  # SPN returns HTTPS URLs
            "http://example.org/"
        ),
        retrieved_at=snapshot.retrieved_at,
        retrieved_by=socket.getfqdn(),
        response_headers={},
        content=b"Example page content from Wayback Machine after Save Page Now",
    )

    assert [(r.method, r.url) for r in configured_requests_mocker.request_history] == [
        (
            "GET",
            "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
        ),
        (
            "GET",
            "https://web.archive.org/save/http://example.org/",
        ),
        (
            "GET",
            "https://web.archive.org/web/20220920164222id_/"  # ditto
            "http://example.org/",
        ),
    ]


def test_get__expired_cache__expired_wb(configured_requests_mocker, opdb_db: Db):
    """
    Tests getting a snapshot that is expired in the local cache, and expired in
    the Wayback Machine -> uses Save Page Now
    """
    snapshotted_at = SNAPSHOT_DATE - datetime.timedelta(days=2)
    retrieved_at = SNAPSHOT_DATE + datetime.timedelta(days=1)
    snapshot = models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=snapshotted_at,
        snapshot_url=None,
        retrieved_at=retrieved_at,
        retrieved_by="localhost",
        response_headers={"Content-Length": "16"},
        content=b"Old page content",
    )
    opdb_db.add_web_page_snapshots([snapshot])

    # Reuse the other test; web_cache.Session should simply ignore the outdated
    # snapshot
    test_get__uncached__expired_wb(configured_requests_mocker, opdb_db)


def test_get__expired_cache__no_wb(configured_requests_mocker, opdb_db: Db):
    """
    Tests getting a snapshot that is expired in the local cache, and absent from
    the Wayback Machine -> uses Save Page Now
    """
    configured_requests_mocker.register_uri(
        "GET",
        "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
        complete_qs=True,
        json={"url": "http://example.org/", "archived_snapshots": {}},
    )

    # Reuse the other test; web_cache.Session should treat the absence of a page
    # exactly the same way as an expired one.
    test_get__uncached__expired_wb(configured_requests_mocker, opdb_db)

pyproject.toml

@@ -5,16 +5,20 @@ build-backend = "setuptools.build_meta"
[project]
name = "opdb"
version = "0.0.1"
requires-python = ">=3.9"
dependencies = [
    "luigi == 3",
    "psycopg == 3",
    "requests == 2",
    "luigi == 3.*",
    "psycopg == 3.*",
    "requests == 2.*",
]

[project.optional-dependencies]
testing = [
    "pytest",
    "pytest-postgresql",
    "requests-mock",
    "types-requests",
    "types-setuptools",
]

[tool.isort]
@@ -23,15 +27,22 @@ profile = "black"
[tool.mypy]
python_version = "3.9"

[[tool.mypy.overrides]]
module = [
    "requests_mock",
]
ignore_missing_imports = true

[tool.pylint.format]
max-line-length = "88"
py-version = "3.9"
disable = [
    # too annoying:
    "no-else-return",
    "too-many-instance-attributes",
    "invalid-name",
    "fixme",
    "invalid-name",
    "no-else-return",
    "too-few-public-methods",
    "too-many-instance-attributes",
    # mypy does it better:
    "no-member",
    "import-error",