Compare commits

...

2 Commits

Author: Val Lorentz   SHA1: 94baeb6516
web_cache: Initialize module
With support for fetching from the IA and caching in the local DB.
(ci/woodpecker/push/woodpecker: pipeline failed)
2022-09-20 21:33:55 +02:00

Author: Val Lorentz   SHA1: 5f3e9d6225
db: Add tests for get_last_web_page_snapshot.
2022-09-20 21:31:30 +02:00
6 changed files with 485 additions and 18 deletions

.woodpecker.yml

@@ -24,6 +24,7 @@ pipeline:
     - apt-get update
     - apt-get install -y postgresql
     - pip3 install mypy .[testing]
+    - yes | mypy --install-types
     - make mypy
     - adduser pytest
     # pytest-postgresql runs pg_ctl, which refuses to run as root
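(The new step auto-confirms the installation of missing third-party type stubs, presumably including the stubs for requests that the new web_cache module needs, so that the subsequent make mypy step can run non-interactively.)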

Makefile

@@ -26,6 +26,6 @@ mypy:
 	mypy --show-error-codes opdb
 pytest:
-	pytest
+	pytest --doctest-modules
 .PHONY: black black-check isort isort-check mypy pytest test
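(With --doctest-modules, pytest also collects and runs the examples embedded in docstrings, such as the doctest of _datetime_from_ia_timestamp in the new opdb/web_cache.py below.)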

opdb/db_test.py

@@ -17,6 +17,7 @@ Tests basic insertion and retrieval functions
 """
 import datetime
+import random

 from opdb.db import Db, models
@@ -29,15 +30,41 @@ def test_missing_web_page_snapshot(opdb_db: Db):
 def test_add_web_page_snapshot(opdb_db: Db):
     """Tests adding a web page and that it can be retrieved."""
     date = datetime.datetime.now(tz=datetime.timezone.utc)
-    snapshot = models.WebPageSnapshot(
-        url="http://example.org/",
-        snapshot_date=datetime.datetime.now(tz=datetime.timezone.utc),
-        snapshot_url=None,
-        retrieved_at=date,
-        retrieved_by="localhost",
-        response_headers={"Content-Length": "7"},
-        content=b"foo bar",
-    )
-    opdb_db.add_web_page_snapshots([snapshot])
+    snapshots = [
+        models.WebPageSnapshot(
+            url=f"http://example.org/{i}",
+            snapshot_date=datetime.datetime.now(tz=datetime.timezone.utc),
+            snapshot_url=None,
+            retrieved_at=date,
+            retrieved_by="localhost",
+            response_headers={"Content-Length": "7"},
+            content=f"snapshot {i}".encode(),
+        )
+        for i in range(100)
+    ]
+    opdb_db.add_web_page_snapshots(snapshots)
-    assert opdb_db.get_last_web_page_snapshot("http://example.org/") == snapshot
+    assert opdb_db.get_last_web_page_snapshot("http://example.org/10") == snapshots[10]
+
+
+def test_get_last_web_page_snapshot(opdb_db: Db):
+    """Tests that the most recent snapshot of a web page is the one retrieved."""
+    date = datetime.datetime.now(tz=datetime.timezone.utc)
+    snapshots = [
+        models.WebPageSnapshot(
+            url="http://example.org/",
+            snapshot_date=datetime.datetime.now(tz=datetime.timezone.utc),
+            snapshot_url=None,
+            retrieved_at=date,
+            retrieved_by="localhost",
+            response_headers={"Content-Length": "7"},
+            content=f"snapshot {i}".encode(),
+        )
+        for i in range(100)
+    ]
+    last_snapshot = snapshots[-1]
+    random.shuffle(snapshots)
+    opdb_db.add_web_page_snapshots(snapshots)
+    assert opdb_db.get_last_web_page_snapshot("http://example.org/") == last_snapshot

opdb/web_cache.py (new file)

@@ -0,0 +1,158 @@
# This file is part of the Open Parts Database software
# Copyright (C) 2022 Valentin Lorentz
#
# This program is free software: you can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License version 3, as published by the
# Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.

"""
Management of the cache of external pages.

This package fetches web pages, archives them in the Internet Archive for future
citation, and caches them in the local database for quick access by other workers.
"""

import datetime
import re
import socket
import typing

import pkg_resources
import requests

from opdb.db import Db, models

_OPDB_VERSION = pkg_resources.require("opdb")[0].version

USER_AGENT = (
    f"OPDB/{_OPDB_VERSION} (Open Parts Database cacher; +https://git.tf/opdb/opdb)"
)

_wayback_url_re = re.compile(
    r"^https?://web\.archive\.org/web/(?P<timestamp>[0-9]{14})/(?P<origin_url>.+)$"
)


def _datetime_from_ia_timestamp(ia_timestamp: str) -> datetime.datetime:
    """
    >>> _datetime_from_ia_timestamp("20220919233014")
    datetime.datetime(2022, 9, 19, 23, 30, 14, tzinfo=datetime.timezone.utc)
    """
    dt = datetime.datetime.strptime(ia_timestamp, "%Y%m%d%H%M%S")
    # Assume it's UTC (neither the Wayback API nor the documentation mention
    # timezones)
    return dt.replace(tzinfo=datetime.timezone.utc)


class Session:
    """
    Wrapper for :class:`requests.Session` that tries to use pages cached locally in
    the PostgreSQL database, and falls back to downloading them, making sure they
    are archived in the Internet Archive.
    """

    def __init__(self, db: Db, min_snapshot_date: datetime.datetime):
        self.min_snapshot_date = min_snapshot_date
        self._db = db
        self._session = requests.Session()
        self._session.headers["User-Agent"] = USER_AGENT

    def _fetch_newest_wayback_snapshot(
        self, url: str
    ) -> typing.Optional[models.WebPageSnapshot]:
        """
        If the URL is already archived in the Internet Archive (and the snapshot is
        newer than configured with ``min_snapshot_date``), retrieves the latest
        snapshot available via the Wayback Machine and returns it.
        """
        # API documentation: https://archive.org/help/wayback_api.php
        response = self._session.get(
            "https://archive.org/wayback/available", params={"url": url}
        )
        response.raise_for_status()  # TODO: retry
        newest_ia_snapshot = (
            response.json().get("archived_snapshots", {}).get("closest", {})
        )
        if not newest_ia_snapshot:
            return None
        ia_timestamp = newest_ia_snapshot["timestamp"]
        snapshot_date = _datetime_from_ia_timestamp(ia_timestamp)
        if snapshot_date < self.min_snapshot_date:
            return None
        wayback_url = newest_ia_snapshot["url"]
        m = _wayback_url_re.match(wayback_url)
        assert m, f"Unexpected Wayback URL format: {wayback_url}"
        assert ia_timestamp == m.group(
            "timestamp"
        ), f"Timestamp unexpectedly missing from snapshot URL: {wayback_url}"
        return self._fetch_wayback_snapshot(url, wayback_url)

    def _fetch_wayback_snapshot(
        self, url: str, wayback_url: str
    ) -> models.WebPageSnapshot:
        # Add "id_" after the timestamp in the Wayback URL; it allows fetching the
        # original page without the navigation header added by the Wayback Machine.
        # Documented at https://archive.org/post/1044859/
        m = _wayback_url_re.match(wayback_url)
        assert m, f"Unexpected Wayback URL format: {wayback_url}"
        ia_timestamp = m.group("timestamp")
        snapshot_url = wayback_url.replace(ia_timestamp, ia_timestamp + "id_", 1)
        response = self._session.get(snapshot_url)
        response.raise_for_status()  # TODO: retry
        return models.WebPageSnapshot(
            url=url,
            snapshot_date=_datetime_from_ia_timestamp(ia_timestamp),
            snapshot_url=snapshot_url,
            retrieved_at=datetime.datetime.now(tz=datetime.timezone.utc),
            retrieved_by=socket.getfqdn(),
            response_headers=dict(response.headers),
            content=response.content,
        )

    def _save_page_now(self, url: str) -> models.WebPageSnapshot:
        response = self._session.get(f"https://web.archive.org/save/{url}")
        response.raise_for_status()  # TODO: retry
        wayback_url = response.headers["Location"]
        return self._fetch_wayback_snapshot(url, wayback_url)

    def _get_cached_snapshot(
        self, url: str
    ) -> typing.Optional[models.WebPageSnapshot]:
        snapshot = self._db.get_last_web_page_snapshot(url)
        if snapshot is None or snapshot.snapshot_date < self.min_snapshot_date:
            return None
        return snapshot

    def get_or_fetch_snapshot(self, url: str) -> models.WebPageSnapshot:
        """
        Fetches the given URL from the local cache or from the Wayback Machine.

        Requests archival by the Internet Archive if the Wayback Machine does not
        already have the page available.
        """
        # First, try the local cache
        snapshot = self._get_cached_snapshot(url)
        if snapshot is not None:
            return snapshot

        # Then, try fetching from the Wayback Machine (and cache it locally)
        snapshot = self._fetch_newest_wayback_snapshot(url)
        if snapshot is not None:
            self._db.add_web_page_snapshots([snapshot])
            return snapshot

        # If the Internet Archive does not have it yet, trigger its Save Page Now,
        # and query the Wayback Machine again
        return self._save_page_now(url)
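
For orientation, here is a minimal usage sketch of the new module. It is not part of this diff; the cite_snapshot helper and the way the Db handle is obtained are assumptions, not code from this repository.

import datetime

from opdb.db import Db
from opdb.web_cache import Session


def cite_snapshot(db: Db, url: str) -> str:
    """Return a citable URL for a page, fetching and archiving it if needed."""
    # Accept snapshots up to 30 days old; older ones are refreshed through the
    # Wayback Machine or Save Page Now.
    min_date = datetime.datetime.now(tz=datetime.timezone.utc) - datetime.timedelta(
        days=30
    )
    session = Session(db, min_snapshot_date=min_date)
    snapshot = session.get_or_fetch_snapshot(url)
    # snapshot_url may be None for snapshots stored without a Wayback URL (as in
    # the tests); fall back to the original URL in that case.
    return snapshot.snapshot_url or url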

opdb/web_cache_test.py (new file)

@@ -0,0 +1,272 @@
# This file is part of the Open Parts Database software
# Copyright (C) 2022 Valentin Lorentz
#
# This program is free software: you can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License version 3, as published by the
# Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.

# pylint: disable=redefined-outer-name

"""
Test generic web page retrieval and caching
"""

import datetime
import socket

import pytest
import requests_mock

from opdb.db import Db, models
from opdb.web_cache import Session


@pytest.fixture
def requests_mocker():
    """Fixture wrapper for :mod:`requests_mock`"""
    with requests_mock.mock() as m:
        yield m


SNAPSHOT_DATE = datetime.datetime(2022, 9, 20, 1, 49, 34, tzinfo=datetime.timezone.utc)


@pytest.fixture
def configured_requests_mocker(requests_mocker):
    """Extension of :func:`requests_mocker` that registers a bunch of URLs of
    the Wayback Machine API."""
    requests_mocker.register_uri(
        "GET",
        "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
        complete_qs=True,
        json={
            "url": "http://example.org/",
            "archived_snapshots": {
                "closest": {
                    "status": "200",
                    "available": True,
                    "url": (
                        "http://web.archive.org/web/20220920014934/"
                        "http://john.smith@example.org/"
                    ),
                    "timestamp": "20220920014934",
                }
            },
        },
    )
    requests_mocker.register_uri(
        "GET",
        "http://web.archive.org/web/20220920014934id_/http://john.smith@example.org/",
        complete_qs=True,
        text="Example page content from Wayback Machine",
    )
    requests_mocker.register_uri(
        "GET",
        "https://web.archive.org/web/20220920164222id_/http://example.org/",
        complete_qs=True,
        text="Example page content from Wayback Machine after Save Page Now",
    )
    requests_mocker.register_uri(
        "GET",
        "https://web.archive.org/save/http://example.org/",
        complete_qs=True,
        headers={
            "location": "https://web.archive.org/web/20220920164222/http://example.org/"
        },
        text="""
            <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
            <title>Redirecting...</title>
            <h1>Redirecting...</h1>
        """,
    )

    yield requests_mocker


def test_get__cached(configured_requests_mocker, opdb_db: Db):
    """
    Tests getting a snapshot that is already in the local cache
    """
    after_date = SNAPSHOT_DATE - datetime.timedelta(days=1)
    s = Session(opdb_db, after_date)

    retrieved_at = datetime.datetime(2022, 2, 1, 0, 0, 0, tzinfo=datetime.timezone.utc)
    snapshot = models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=datetime.datetime.now(tz=datetime.timezone.utc),
        snapshot_url=None,
        retrieved_at=retrieved_at,
        retrieved_by="localhost",
        response_headers={"Content-Length": "19"},
        content=b"Example page content",
    )
    opdb_db.add_web_page_snapshots([snapshot])

    assert s.get_or_fetch_snapshot("http://example.org/") == snapshot

    assert [(r.method, r.url) for r in configured_requests_mocker.request_history] == []


def test_get__uncached__recent_wb(configured_requests_mocker, opdb_db: Db):
    """
    Tests getting a snapshot that is not in the local cache, but is available in
    the Wayback Machine
    """
    after_date = SNAPSHOT_DATE - datetime.timedelta(days=1)
    s = Session(opdb_db, after_date)

    dt_before = datetime.datetime.now(tz=datetime.timezone.utc)
    snapshot = s.get_or_fetch_snapshot("http://example.org/")
    dt_after = datetime.datetime.now(tz=datetime.timezone.utc)

    assert dt_before <= snapshot.retrieved_at <= dt_after

    assert snapshot == models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=datetime.datetime(
            2022, 9, 20, 1, 49, 34, tzinfo=datetime.timezone.utc
        ),
        snapshot_url=(
            "http://web.archive.org/web/20220920014934id_/"
            "http://john.smith@example.org/"
        ),
        retrieved_at=snapshot.retrieved_at,
        retrieved_by=socket.getfqdn(),
        response_headers={},
        content=b"Example page content from Wayback Machine",
    )

    assert [(r.method, r.url) for r in configured_requests_mocker.request_history] == [
        (
            "GET",
            "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
        ),
        (
            "GET",
            "http://web.archive.org/web/20220920014934id_/"
            "http://john.smith@example.org/",
        ),
    ]


def test_get__expired_cache__recent_wb(configured_requests_mocker, opdb_db: Db):
    """
    Tests getting a snapshot that is expired in the local cache, but is available in
    the Wayback Machine
    """
    snapshoted_at = SNAPSHOT_DATE - datetime.timedelta(days=2)
    retrieved_at = SNAPSHOT_DATE + datetime.timedelta(days=1)
    snapshot = models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=snapshoted_at,
        snapshot_url=None,
        retrieved_at=retrieved_at,
        retrieved_by="localhost",
        response_headers={"Content-Length": "16"},
        content=b"Old page content",
    )
    opdb_db.add_web_page_snapshots([snapshot])

    # Reuse the other test; web_cache.Session should simply ignore the outdated
    # snapshot
    test_get__uncached__recent_wb(configured_requests_mocker, opdb_db)


def test_get__uncached__expired_wb(configured_requests_mocker, opdb_db: Db):
    """
    Tests getting a snapshot that is not in the local cache, and expired in
    the Wayback Machine -> uses Save Page Now
    """
    after_date = SNAPSHOT_DATE + datetime.timedelta(days=1)
    s = Session(opdb_db, after_date)

    dt_before = datetime.datetime.now(tz=datetime.timezone.utc)
    snapshot = s.get_or_fetch_snapshot("http://example.org/")
    dt_after = datetime.datetime.now(tz=datetime.timezone.utc)

    assert dt_before <= snapshot.retrieved_at <= dt_after

    assert snapshot == models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=datetime.datetime(
            2022, 9, 20, 16, 42, 22, tzinfo=datetime.timezone.utc
        ),
        snapshot_url=(
            "https://web.archive.org/web/20220920164222id_/"  # SPN returns HTTPS URLs
            "http://example.org/"
        ),
        retrieved_at=snapshot.retrieved_at,
        retrieved_by=socket.getfqdn(),
        response_headers={},
        content=b"Example page content from Wayback Machine after Save Page Now",
    )

    assert [(r.method, r.url) for r in configured_requests_mocker.request_history] == [
        (
            "GET",
            "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
        ),
        (
            "GET",
            "https://web.archive.org/save/http://example.org/",
        ),
        (
            "GET",
            "https://web.archive.org/web/20220920164222id_/"  # ditto
            "http://example.org/",
        ),
    ]


def test_get__expired_cache__expired_wb(configured_requests_mocker, opdb_db: Db):
    """
    Tests getting a snapshot that is expired in the local cache, and expired in
    the Wayback Machine -> uses Save Page Now
    """
    snapshoted_at = SNAPSHOT_DATE - datetime.timedelta(days=2)
    retrieved_at = SNAPSHOT_DATE + datetime.timedelta(days=1)
    snapshot = models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=snapshoted_at,
        snapshot_url=None,
        retrieved_at=retrieved_at,
        retrieved_by="localhost",
        response_headers={"Content-Length": "16"},
        content=b"Old page content",
    )
    opdb_db.add_web_page_snapshots([snapshot])

    # Reuse the other test; web_cache.Session should simply ignore the outdated
    # snapshot
    test_get__uncached__expired_wb(configured_requests_mocker, opdb_db)


def test_get__expired_cache__no_wb(configured_requests_mocker, opdb_db: Db):
    """
    Tests getting a snapshot that is expired in the local cache, and absent from
    the Wayback Machine -> uses Save Page Now
    """
    configured_requests_mocker.register_uri(
        "GET",
        "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
        complete_qs=True,
        json={"url": "http://example.org/", "archived_snapshots": {}},
    )

    # Reuse the other test; web_cache.Session should treat the absence of a page
    # exactly the same way as an expired one.
    test_get__uncached__expired_wb(configured_requests_mocker, opdb_db)

pyproject.toml

@@ -5,16 +5,18 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "opdb"
 version = "0.0.1"
 requires-python = ">=3.9"
 dependencies = [
-    "luigi == 3",
-    "psycopg == 3",
-    "requests == 2",
+    "luigi == 3.*",
+    "psycopg == 3.*",
+    "requests == 2.*",
 ]

 [project.optional-dependencies]
 testing = [
     "pytest",
     "pytest-postgresql",
+    "requests-mock",
 ]

 [tool.isort]
@@ -23,15 +25,22 @@ profile = "black"

 [tool.mypy]
 python_version = "3.9"

+[[tool.mypy.overrides]]
+module = [
+    "requests_mock",
+]
+ignore_missing_imports = true

 [tool.pylint.format]
 max-line-length = "88"
 py-version = "3.9"
 disable = [
     # too annoying:
-    "no-else-return",
-    "too-many-instance-attributes",
-    "invalid-name",
+    "fixme",
+    "invalid-name",
+    "no-else-return",
+    "too-few-public-methods",
+    "too-many-instance-attributes",
     # mypy does it better:
     "no-member",
     "import-error",