web_cache: Initialize module

With support for fetching from the IA and caching in the local DB.
Val Lorentz 2022-09-20 21:33:55 +02:00
parent 5f3e9d6225
commit 6f022e5f05
4 changed files with 448 additions and 7 deletions

Makefile

@@ -26,6 +26,6 @@ mypy:
	mypy --show-error-codes opdb
pytest:
	pytest
	pytest --doctest-modules
.PHONY: black black-check isort isort-check mypy pytest test

opdb/web_cache.py Normal file

@@ -0,0 +1,158 @@
# This file is part of the Open Parts Database software
# Copyright (C) 2022 Valentin Lorentz
#
# This program is free software: you can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License version 3, as published by the
# Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
"""
Management of the cache of external pages.
This package fetches web pages, archives them in the Internet Archive for future
citation, and caches them in the local database for quick access by other workers.
"""
import datetime
import re
import socket
import typing
import pkg_resources
import requests
from opdb.db import Db, models
_OPDB_VERSION = pkg_resources.require("opdb")[0].version
USER_AGENT = (
f"OPDB/{_OPDB_VERSION} (Open Parts Database cacher; +https://git.tf/opdb/opdb)"
)
_wayback_url_re = re.compile(
r"^https?://web\.archive\.org/web/(?P<timestamp>[0-9]{14})/(?P<origin_url>.+)$"
)
def _datetime_from_ia_timestamp(ia_timestamp: str) -> datetime.datetime:
"""
>>> _datetime_from_ia_timestamp("20220919233014")
datetime.datetime(2022, 9, 19, 23, 30, 14, tzinfo=datetime.timezone.utc)
"""
dt = datetime.datetime.strptime(ia_timestamp, "%Y%m%d%H%M%S")
# Assume it's UTC (neither the Wayback API nor the documentation mention
# timezones)
return dt.replace(tzinfo=datetime.timezone.utc)
class Session:
"""
Wrapper for :class:`requests.Session`, which tries to use pages cached locally in
the postgresql database, and falls back to downloading; making sure they are
archived in the Internet Archive.
"""
def __init__(self, db: Db, min_snapshot_date: datetime.datetime):
self.min_snapshot_date = min_snapshot_date
self._db = db
self._session = requests.Session()
self._session.headers["User-Agent"] = USER_AGENT
def _fetch_newest_wayback_snapshot(
self, url: str
) -> typing.Optional[models.WebPageSnapshot]:
"""
If the URL is already archived in the Internet Archive (and newer than
configured with ``min_snapshot_date``), retrieves the latest snapshot available
via the Wayback Machine and returns it.
"""
# API documentation: https://archive.org/help/wayback_api.php
response = self._session.get(
"https://archive.org/wayback/available", params={"url": url}
)
response.raise_for_status() # TODO: retry
newest_ia_snapshot = (
response.json().get("archived_snapshots", {}).get("closest", {})
)
if not newest_ia_snapshot:
return None
ia_timestamp = newest_ia_snapshot["timestamp"]
snapshot_date = _datetime_from_ia_timestamp(ia_timestamp)
if snapshot_date < self.min_snapshot_date:
return None
wayback_url = newest_ia_snapshot["url"]
m = _wayback_url_re.match(wayback_url)
assert m, f"Unexpected Wayback URL format: {wayback_url}"
assert ia_timestamp == m.group(
"timestamp"
), "Timestamp unexpectedly missing from snapshot URL: {wayback_url}"
return self._fetch_wayback_snapshot(url, wayback_url)
def _fetch_wayback_snapshot(
self, url: str, wayback_url: str
) -> models.WebPageSnapshot:
# Add "id_" after the timestamp in the Wayback URL; it allows fetching the
# original page without the navigation header added by the Wayback Machine.
# Documented at https://archive.org/post/1044859/
m = _wayback_url_re.match(wayback_url)
assert m, f"Unexpected Wayback URL format: {wayback_url}"
ia_timestamp = m.group("timestamp")
snapshot_url = wayback_url.replace(ia_timestamp, ia_timestamp + "id_", 1)
response = self._session.get(snapshot_url)
response.raise_for_status() # TODO: retry
return models.WebPageSnapshot(
url=url,
snapshot_date=_datetime_from_ia_timestamp(ia_timestamp),
snapshot_url=snapshot_url,
retrieved_at=datetime.datetime.now(tz=datetime.timezone.utc),
retrieved_by=socket.getfqdn(),
response_headers=dict(response.headers),
content=response.content,
)
def _save_page_now(self, url: str) -> models.WebPageSnapshot:
response = self._session.get(f"https://web.archive.org/save/{url}")
response.raise_for_status() # TODO: retry
wayback_url = response.headers["Location"]
return self._fetch_wayback_snapshot(url, wayback_url)
def _get_cached_snapshot(self, url: str) -> typing.Optional[models.WebPageSnapshot]:
snapshot = self._db.get_last_web_page_snapshot(url)
if snapshot is None or snapshot.snapshot_date < self.min_snapshot_date:
return None
return snapshot
def get_or_fetch_snapshot(self, url: str) -> models.WebPageSnapshot:
"""
Fetches the given URL from the local cache or from the Wayback Machine.
Requests archival by the Internet Archive if the Wayback Machine does not
already have the page available.
"""
# First, try the local cache
snapshot = self._get_cached_snapshot(url)
if snapshot is not None:
return snapshot
# Then, try fetching from the Wayback Machine (and cache it locally)
snapshot = self._fetch_newest_wayback_snapshot(url)
if snapshot is not None:
self._db.add_web_page_snapshots([snapshot])
return snapshot
# If the Internet Archive does not have it yet, trigger its Save Code Now,
# and query the Wayback Machine again
return self._save_page_now(url)
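
For orientation, a minimal usage sketch of the new module. The `Db` constructor argument here is hypothetical; only `Session`, `get_or_fetch_snapshot`, and the `WebPageSnapshot` fields are taken from the code above:

import datetime

from opdb.db import Db
from opdb.web_cache import Session

# Hypothetical connection setup; the real Db constructor may take different
# arguments.
db = Db("postgresql:///opdb")

# Accept snapshots up to 30 days old; anything older is refreshed from the
# Wayback Machine, or re-archived via Save Page Now if the Internet Archive
# has nothing recent enough.
min_snapshot_date = datetime.datetime.now(
    tz=datetime.timezone.utc
) - datetime.timedelta(days=30)

session = Session(db, min_snapshot_date)
snapshot = session.get_or_fetch_snapshot("http://example.org/")
print(snapshot.snapshot_url, snapshot.snapshot_date, len(snapshot.content))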

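As an aside, the three `# TODO: retry` comments could be addressed with urllib3's `Retry` mounted on the underlying `requests` session. A rough sketch, not part of this commit, using urllib3 ≥ 1.26 parameter names and illustrative values only:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def _make_retrying_session() -> requests.Session:
    # Retry GETs on connection errors, 429 rate limiting, and 5xx responses,
    # with exponential backoff between attempts.
    retry = Retry(
        total=5,
        backoff_factor=0.5,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],
    )
    session = requests.Session()
    session.mount("http://", HTTPAdapter(max_retries=retry))
    session.mount("https://", HTTPAdapter(max_retries=retry))
    return session
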
opdb/web_cache_test.py Normal file

@@ -0,0 +1,272 @@
# This file is part of the Open Parts Database software
# Copyright (C) 2022 Valentin Lorentz
#
# This program is free software: you can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License version 3, as published by the
# Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
# pylint: disable=redefined-outer-name
"""
Test generic web page retrieval and caching
"""
import datetime
import socket

import pytest
import requests_mock

from opdb.db import Db, models
from opdb.web_cache import Session


@pytest.fixture
def requests_mocker():
    """Fixture wrapper for :mod:`requests_mock`"""
    with requests_mock.mock() as m:
        yield m


SNAPSHOT_DATE = datetime.datetime(2022, 9, 20, 1, 49, 34, tzinfo=datetime.timezone.utc)


@pytest.fixture
def configured_requests_mocker(requests_mocker):
    """Extension of :func:`requests_mocker` that registers a bunch of URLs of
    the Wayback Machine API."""
    requests_mocker.register_uri(
        "GET",
        "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
        complete_qs=True,
        json={
            "url": "http://example.org/",
            "archived_snapshots": {
                "closest": {
                    "status": "200",
                    "available": True,
                    "url": (
                        "http://web.archive.org/web/20220920014934/"
                        "http://john.smith@example.org/"
                    ),
                    "timestamp": "20220920014934",
                }
            },
        },
    )
    requests_mocker.register_uri(
        "GET",
        "http://web.archive.org/web/20220920014934id_/http://john.smith@example.org/",
        complete_qs=True,
        text="Example page content from Wayback Machine",
    )
    requests_mocker.register_uri(
        "GET",
        "https://web.archive.org/web/20220920164222id_/http://example.org/",
        complete_qs=True,
        text="Example page content from Wayback Machine after Save Page Now",
    )
    requests_mocker.register_uri(
        "GET",
        "https://web.archive.org/save/http://example.org/",
        complete_qs=True,
        headers={
            "location": "https://web.archive.org/web/20220920164222/http://example.org/"
        },
        text="""
            <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
            <title>Redirecting...</title>
            <h1>Redirecting...</h1>
        """,
    )
    yield requests_mocker


def test_get__cached(configured_requests_mocker, opdb_db: Db):
    """
    Tests getting a snapshot that is already in the local cache
    """
    after_date = SNAPSHOT_DATE - datetime.timedelta(days=1)
    s = Session(opdb_db, after_date)
    retrieved_at = datetime.datetime(2022, 2, 1, 0, 0, 0, tzinfo=datetime.timezone.utc)
    snapshot = models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=datetime.datetime.now(tz=datetime.timezone.utc),
        snapshot_url=None,
        retrieved_at=retrieved_at,
        retrieved_by="localhost",
        response_headers={"Content-Length": "20"},
        content=b"Example page content",
    )
    opdb_db.add_web_page_snapshots([snapshot])

    assert s.get_or_fetch_snapshot("http://example.org/") == snapshot
    assert [(r.method, r.url) for r in configured_requests_mocker.request_history] == []


def test_get__uncached__recent_wb(configured_requests_mocker, opdb_db: Db):
    """
    Tests getting a snapshot that is not in the local cache, but is available in
    the Wayback Machine
    """
    after_date = SNAPSHOT_DATE - datetime.timedelta(days=1)
    s = Session(opdb_db, after_date)

    dt_before = datetime.datetime.now(tz=datetime.timezone.utc)
    snapshot = s.get_or_fetch_snapshot("http://example.org/")
    dt_after = datetime.datetime.now(tz=datetime.timezone.utc)

    assert dt_before <= snapshot.retrieved_at <= dt_after
    assert snapshot == models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=datetime.datetime(
            2022, 9, 20, 1, 49, 34, tzinfo=datetime.timezone.utc
        ),
        snapshot_url=(
            "http://web.archive.org/web/20220920014934id_/"
            "http://john.smith@example.org/"
        ),
        retrieved_at=snapshot.retrieved_at,
        retrieved_by=socket.getfqdn(),
        response_headers={},
        content=b"Example page content from Wayback Machine",
    )

    assert [(r.method, r.url) for r in configured_requests_mocker.request_history] == [
        (
            "GET",
            "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
        ),
        (
            "GET",
            "http://web.archive.org/web/20220920014934id_/"
            "http://john.smith@example.org/",
        ),
    ]


def test_get__expired_cache__recent_wb(configured_requests_mocker, opdb_db: Db):
    """
    Tests getting a snapshot that is expired in the local cache, but is available in
    the Wayback Machine
    """
    snapshotted_at = SNAPSHOT_DATE - datetime.timedelta(days=2)
    retrieved_at = SNAPSHOT_DATE + datetime.timedelta(days=1)
    snapshot = models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=snapshotted_at,
        snapshot_url=None,
        retrieved_at=retrieved_at,
        retrieved_by="localhost",
        response_headers={"Content-Length": "16"},
        content=b"Old page content",
    )
    opdb_db.add_web_page_snapshots([snapshot])

    # Reuse the other test; web_cache.Session should simply ignore the outdated
    # snapshot
    test_get__uncached__recent_wb(configured_requests_mocker, opdb_db)


def test_get__uncached__expired_wb(configured_requests_mocker, opdb_db: Db):
    """
    Tests getting a snapshot that is not in the local cache, and expired in
    the Wayback Machine -> uses Save Page Now
    """
    after_date = SNAPSHOT_DATE + datetime.timedelta(days=1)
    s = Session(opdb_db, after_date)

    dt_before = datetime.datetime.now(tz=datetime.timezone.utc)
    snapshot = s.get_or_fetch_snapshot("http://example.org/")
    dt_after = datetime.datetime.now(tz=datetime.timezone.utc)

    assert dt_before <= snapshot.retrieved_at <= dt_after
    assert snapshot == models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=datetime.datetime(
            2022, 9, 20, 16, 42, 22, tzinfo=datetime.timezone.utc
        ),
        snapshot_url=(
            "https://web.archive.org/web/20220920164222id_/"  # SPN returns HTTPS URLs
            "http://example.org/"
        ),
        retrieved_at=snapshot.retrieved_at,
        retrieved_by=socket.getfqdn(),
        response_headers={},
        content=b"Example page content from Wayback Machine after Save Page Now",
    )

    assert [(r.method, r.url) for r in configured_requests_mocker.request_history] == [
        (
            "GET",
            "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
        ),
        (
            "GET",
            "https://web.archive.org/save/http://example.org/",
        ),
        (
            "GET",
            "https://web.archive.org/web/20220920164222id_/"  # ditto
            "http://example.org/",
        ),
    ]


def test_get__expired_cache__expired_wb(configured_requests_mocker, opdb_db: Db):
    """
    Tests getting a snapshot that is expired in the local cache, and expired in
    the Wayback Machine -> uses Save Page Now
    """
    snapshotted_at = SNAPSHOT_DATE - datetime.timedelta(days=2)
    retrieved_at = SNAPSHOT_DATE + datetime.timedelta(days=1)
    snapshot = models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=snapshotted_at,
        snapshot_url=None,
        retrieved_at=retrieved_at,
        retrieved_by="localhost",
        response_headers={"Content-Length": "16"},
        content=b"Old page content",
    )
    opdb_db.add_web_page_snapshots([snapshot])

    # Reuse the other test; web_cache.Session should simply ignore the outdated
    # snapshot
    test_get__uncached__expired_wb(configured_requests_mocker, opdb_db)


def test_get__expired_cache__no_wb(configured_requests_mocker, opdb_db: Db):
    """
    Tests getting a snapshot that is expired in the local cache, and absent from
    the Wayback Machine -> uses Save Page Now
    """
    configured_requests_mocker.register_uri(
        "GET",
        "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
        complete_qs=True,
        json={"url": "http://example.org/", "archived_snapshots": {}},
    )

    # Reuse the other test; web_cache.Session should treat the absence of a page
    # exactly the same way as an expired one.
    test_get__uncached__expired_wb(configured_requests_mocker, opdb_db)

pyproject.toml

@@ -5,16 +5,20 @@ build-backend = "setuptools.build_meta"
[project]
name = "opdb"
version = "0.0.1"
requires-python = ">=3.9"
dependencies = [
    "luigi == 3",
    "psycopg == 3",
    "requests == 2",
    "luigi == 3.*",
    "psycopg == 3.*",
    "requests == 2.*",
]

[project.optional-dependencies]
testing = [
    "pytest",
    "pytest-postgresql",
    "requests-mock",
    "types-requests",
    "types-setuptools",
]

[tool.isort]
@@ -23,15 +27,22 @@ profile = "black"
[tool.mypy]
python_version = "3.9"

[[tool.mypy.overrides]]
module = [
    "requests_mock",
]
ignore_missing_imports = true

[tool.pylint.format]
max-line-length = "88"
py-version = "3.9"
disable = [
    # too annoying:
    "no-else-return",
    "too-many-instance-attributes",
    "invalid-name",
    "fixme",
    "invalid-name",
    "no-else-return",
    "too-few-public-methods",
    "too-many-instance-attributes",
    # mypy does it better:
    "no-member",
    "import-error",