web_cache: Initialize module
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
With support for fetching pages from the Internet Archive and caching them in the local DB.
This commit is contained in:
parent
5f3e9d6225
commit
6f022e5f05
2
Makefile
2
Makefile
|
@ -26,6 +26,6 @@ mypy:
|
|||
mypy --show-error-codes opdb
|
||||
|
||||
pytest:
|
||||
pytest
|
||||
pytest --doctest-modules
|
||||
|
||||
.PHONY: black black-check isort isort-check mypy pytest test
|
||||
|
|
|
@ -0,0 +1,158 @@
|
|||
# This file is part of the Open Parts Database software
|
||||
# Copyright (C) 2022 Valentin Lorentz
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under the
|
||||
# terms of the GNU Affero General Public License version 3, as published by the
|
||||
# Free Software Foundation.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along with
|
||||
# this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
"""
|
||||
Management of the cache of external pages.
|
||||
|
||||
This package fetches web pages, archives them in the Internet Archive for future
|
||||
citation, and caches them in the local database for quick access by other workers.
|
||||
"""
|
||||
|
||||
import datetime
|
||||
import re
|
||||
import socket
|
||||
import typing
|
||||
|
||||
import pkg_resources
|
||||
import requests
|
||||
|
||||
from opdb.db import Db, models
|
||||
|
||||
# Version of the installed "opdb" distribution, advertised in the User-Agent.
# NOTE(review): pkg_resources is deprecated upstream; importlib.metadata.version
# is the stdlib replacement on Python >= 3.8 — worth migrating.
_OPDB_VERSION = pkg_resources.require("opdb")[0].version
# Crawler-etiquette User-Agent: software name/version, purpose, and contact URL.
USER_AGENT = (
    f"OPDB/{_OPDB_VERSION} (Open Parts Database cacher; +https://git.tf/opdb/opdb)"
)

# Matches Wayback Machine snapshot URLs, e.g.
# http://web.archive.org/web/20220920014934/http://example.org/
# capturing the 14-digit snapshot timestamp and the original (archived) URL.
_wayback_url_re = re.compile(
    r"^https?://web\.archive\.org/web/(?P<timestamp>[0-9]{14})/(?P<origin_url>.+)$"
)
|
||||
|
||||
|
||||
def _datetime_from_ia_timestamp(ia_timestamp: str) -> datetime.datetime:
|
||||
"""
|
||||
>>> _datetime_from_ia_timestamp("20220919233014")
|
||||
datetime.datetime(2022, 9, 19, 23, 30, 14, tzinfo=datetime.timezone.utc)
|
||||
"""
|
||||
dt = datetime.datetime.strptime(ia_timestamp, "%Y%m%d%H%M%S")
|
||||
# Assume it's UTC (neither the Wayback API nor the documentation mention
|
||||
# timezones)
|
||||
return dt.replace(tzinfo=datetime.timezone.utc)
|
||||
|
||||
|
||||
class Session:
    """
    Wrapper for :class:`requests.Session`, which tries to use pages cached locally in
    the postgresql database, and falls back to downloading; making sure they are
    archived in the Internet Archive.
    """

    def __init__(self, db: Db, min_snapshot_date: datetime.datetime):
        """
        :param db: handle to the local database used as the snapshot cache
        :param min_snapshot_date: snapshots older than this date are ignored,
            both in the local cache and in the Wayback Machine
        """
        self.min_snapshot_date = min_snapshot_date
        self._db = db
        self._session = requests.Session()
        # Identify ourselves to the Internet Archive and the sites we fetch.
        self._session.headers["User-Agent"] = USER_AGENT

    def _fetch_newest_wayback_snapshot(
        self, url: str
    ) -> typing.Optional[models.WebPageSnapshot]:
        """
        If the URL is already archived in the Internet Archive (and newer than
        configured with ``min_snapshot_date``), retrieves the latest snapshot available
        via the Wayback Machine and returns it; returns ``None`` otherwise.
        """
        # API documentation: https://archive.org/help/wayback_api.php
        response = self._session.get(
            "https://archive.org/wayback/available", params={"url": url}
        )
        response.raise_for_status()  # TODO: retry

        newest_ia_snapshot = (
            response.json().get("archived_snapshots", {}).get("closest", {})
        )
        if not newest_ia_snapshot:
            return None

        ia_timestamp = newest_ia_snapshot["timestamp"]
        snapshot_date = _datetime_from_ia_timestamp(ia_timestamp)

        if snapshot_date < self.min_snapshot_date:
            return None

        wayback_url = newest_ia_snapshot["url"]
        m = _wayback_url_re.match(wayback_url)
        assert m, f"Unexpected Wayback URL format: {wayback_url}"
        # Bug fix: this assertion message was a plain string (missing the
        # f-prefix), so "{wayback_url}" was emitted literally on failure.
        assert ia_timestamp == m.group(
            "timestamp"
        ), f"Timestamp unexpectedly missing from snapshot URL: {wayback_url}"

        return self._fetch_wayback_snapshot(url, wayback_url)

    def _fetch_wayback_snapshot(
        self, url: str, wayback_url: str
    ) -> models.WebPageSnapshot:
        """
        Downloads the given Wayback Machine snapshot and builds a
        :class:`models.WebPageSnapshot` from it (does not write to the database).
        """
        # Add "id_" after the timestamp in the Wayback URL; it allows fetching the
        # original page without the navigation header added by the Wayback Machine.
        # Documented at https://archive.org/post/1044859/
        m = _wayback_url_re.match(wayback_url)
        assert m, f"Unexpected Wayback URL format: {wayback_url}"
        ia_timestamp = m.group("timestamp")
        snapshot_url = wayback_url.replace(ia_timestamp, ia_timestamp + "id_", 1)

        response = self._session.get(snapshot_url)
        response.raise_for_status()  # TODO: retry

        return models.WebPageSnapshot(
            url=url,
            snapshot_date=_datetime_from_ia_timestamp(ia_timestamp),
            snapshot_url=snapshot_url,
            retrieved_at=datetime.datetime.now(tz=datetime.timezone.utc),
            retrieved_by=socket.getfqdn(),
            response_headers=dict(response.headers),
            content=response.content,
        )

    def _save_page_now(self, url: str) -> models.WebPageSnapshot:
        """
        Asks the Internet Archive's Save Page Now service to archive the URL,
        then downloads the freshly created snapshot.
        """
        response = self._session.get(f"https://web.archive.org/save/{url}")
        response.raise_for_status()  # TODO: retry
        # Save Page Now returns the Wayback URL of the new snapshot in the
        # Location header.
        wayback_url = response.headers["Location"]
        return self._fetch_wayback_snapshot(url, wayback_url)

    def _get_cached_snapshot(self, url: str) -> typing.Optional[models.WebPageSnapshot]:
        """
        Returns the most recent locally-cached snapshot of the URL, unless it
        is older than ``min_snapshot_date``.
        """
        snapshot = self._db.get_last_web_page_snapshot(url)
        if snapshot is None or snapshot.snapshot_date < self.min_snapshot_date:
            return None

        return snapshot

    def get_or_fetch_snapshot(self, url: str) -> models.WebPageSnapshot:
        """
        Fetches the given URL from the local cache or from the Wayback Machine.

        Requests archival by the Internet Archive if the Wayback Machine does not
        already have the page available.
        """
        # First, try the local cache
        snapshot = self._get_cached_snapshot(url)
        if snapshot is not None:
            return snapshot

        # Then, try fetching from the Wayback Machine (and cache it locally)
        snapshot = self._fetch_newest_wayback_snapshot(url)
        if snapshot is not None:
            self._db.add_web_page_snapshots([snapshot])
            return snapshot

        # If the Internet Archive does not have it yet, trigger its Save Page Now
        # service. Consistency fix: cache the resulting snapshot locally too, as
        # the Wayback path above does (and as the module docstring promises).
        snapshot = self._save_page_now(url)
        self._db.add_web_page_snapshots([snapshot])
        return snapshot
|
|
@ -0,0 +1,272 @@
|
|||
# This file is part of the Open Parts Database software
|
||||
# Copyright (C) 2022 Valentin Lorentz
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under the
|
||||
# terms of the GNU Affero General Public License version 3, as published by the
|
||||
# Free Software Foundation.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along with
|
||||
# this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
# pylint: disable=redefined-outer-name
|
||||
|
||||
"""
|
||||
Test generic web page retrieval and caching
|
||||
"""
|
||||
|
||||
import datetime
|
||||
import socket
|
||||
|
||||
import pytest
|
||||
import requests_mock
|
||||
|
||||
from opdb.db import Db, models
|
||||
from opdb.web_cache import Session
|
||||
|
||||
|
||||
@pytest.fixture
def requests_mocker():
    """Yield an active :mod:`requests_mock` mocker for the duration of a test."""
    mocker = requests_mock.mock()
    with mocker as active_mock:
        yield active_mock
|
||||
|
||||
|
||||
# Date of the snapshot served by the mocked Wayback "available" endpoint below
# (IA timestamp "20220920014934", assumed UTC).
SNAPSHOT_DATE = datetime.datetime(2022, 9, 20, 1, 49, 34, tzinfo=datetime.timezone.utc)
|
||||
|
||||
|
||||
@pytest.fixture
def configured_requests_mocker(requests_mocker):
    """Extension of :func:`requests_mocker` that registers a bunch of URLs of
    the Wayback Machine API."""
    # "available" API: reports one snapshot of http://example.org/ taken at
    # 20220920014934. The Wayback URL deliberately contains userinfo
    # ("john.smith@") to exercise URL parsing/matching.
    requests_mocker.register_uri(
        "GET",
        "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
        complete_qs=True,
        json={
            "url": "http://example.org/",
            "archived_snapshots": {
                "closest": {
                    "status": "200",
                    "available": True,
                    "url": (
                        "http://web.archive.org/web/20220920014934/"
                        "http://john.smith@example.org/"
                    ),
                    "timestamp": "20220920014934",
                }
            },
        },
    )

    # The snapshot above, fetched through the "id_" (raw content) URL form.
    requests_mocker.register_uri(
        "GET",
        "http://web.archive.org/web/20220920014934id_/http://john.smith@example.org/",
        complete_qs=True,
        text="Example page content from Wayback Machine",
    )

    # Raw-content URL of the snapshot created by the mocked Save Page Now below.
    requests_mocker.register_uri(
        "GET",
        "https://web.archive.org/web/20220920164222id_/http://example.org/",
        complete_qs=True,
        text="Example page content from Wayback Machine after Save Page Now",
    )

    # Save Page Now endpoint: replies with the new snapshot's Wayback URL in
    # the Location header (header lookup is case-insensitive in requests).
    requests_mocker.register_uri(
        "GET",
        "https://web.archive.org/save/http://example.org/",
        complete_qs=True,
        headers={
            "location": "https://web.archive.org/web/20220920164222/http://example.org/"
        },
        text="""
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<title>Redirecting...</title>
<h1>Redirecting...</h1>
""",
    )

    yield requests_mocker
|
||||
|
||||
|
||||
def test_get__cached(configured_requests_mocker, opdb_db: Db):
    """
    Tests getting a snapshot that is already in the local cache
    """
    after_date = SNAPSHOT_DATE - datetime.timedelta(days=1)
    session = Session(opdb_db, after_date)

    retrieved_at = datetime.datetime(2022, 2, 1, 0, 0, 0, tzinfo=datetime.timezone.utc)

    cached_snapshot = models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=datetime.datetime.now(tz=datetime.timezone.utc),
        snapshot_url=None,
        retrieved_at=retrieved_at,
        retrieved_by="localhost",
        response_headers={"Content-Length": "19"},
        content=b"Example page content",
    )

    opdb_db.add_web_page_snapshots([cached_snapshot])

    # The cached snapshot is fresh enough, so it must be returned as-is...
    assert session.get_or_fetch_snapshot("http://example.org/") == cached_snapshot

    # ...without any HTTP request being made.
    requests_made = [
        (request.method, request.url)
        for request in configured_requests_mocker.request_history
    ]
    assert requests_made == []
|
||||
|
||||
|
||||
def test_get__uncached__recent_wb(configured_requests_mocker, opdb_db: Db):
    """
    Tests getting a snapshot that is not in the local cache, but is available in
    the Wayback Machine
    """
    after_date = SNAPSHOT_DATE - datetime.timedelta(days=1)
    session = Session(opdb_db, after_date)

    before = datetime.datetime.now(tz=datetime.timezone.utc)
    snapshot = session.get_or_fetch_snapshot("http://example.org/")
    after = datetime.datetime.now(tz=datetime.timezone.utc)

    # retrieved_at is generated inside the call; we can only bound it.
    assert before <= snapshot.retrieved_at <= after

    expected_snapshot = models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=datetime.datetime(
            2022, 9, 20, 1, 49, 34, tzinfo=datetime.timezone.utc
        ),
        snapshot_url=(
            "http://web.archive.org/web/20220920014934id_/"
            "http://john.smith@example.org/"
        ),
        retrieved_at=snapshot.retrieved_at,
        retrieved_by=socket.getfqdn(),
        response_headers={},
        content=b"Example page content from Wayback Machine",
    )
    assert snapshot == expected_snapshot

    # Exactly two requests: the "available" lookup, then the raw snapshot fetch.
    requests_made = [
        (request.method, request.url)
        for request in configured_requests_mocker.request_history
    ]
    assert requests_made == [
        (
            "GET",
            "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
        ),
        (
            "GET",
            "http://web.archive.org/web/20220920014934id_/"
            "http://john.smith@example.org/",
        ),
    ]
|
||||
|
||||
|
||||
def test_get__expired_cache__recent_wb(configured_requests_mocker, opdb_db: Db):
    """
    Tests getting a snapshot that is expired in the local cache, but is available in
    the Wayback Machine
    """
    # Seed the cache with a snapshot older than the freshness cutoff used by
    # the reused test below.
    stale_snapshot = models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=SNAPSHOT_DATE - datetime.timedelta(days=2),
        snapshot_url=None,
        retrieved_at=SNAPSHOT_DATE + datetime.timedelta(days=1),
        retrieved_by="localhost",
        response_headers={"Content-Length": "16"},
        content=b"Old page content",
    )
    opdb_db.add_web_page_snapshots([stale_snapshot])

    # Reuse the other test; web_cache.Session should simply ignore the outdated snapshot
    test_get__uncached__recent_wb(configured_requests_mocker, opdb_db)
|
||||
|
||||
|
||||
def test_get__uncached__expired_wb(configured_requests_mocker, opdb_db: Db):
    """
    Tests getting a snapshot that is not in the local cache, and expired in
    the Wayback Machine -> uses Save Page Now
    """
    # The cutoff is *after* the snapshot the Wayback Machine reports, so that
    # snapshot must be rejected and Save Page Now used instead.
    after_date = SNAPSHOT_DATE + datetime.timedelta(days=1)
    session = Session(opdb_db, after_date)

    before = datetime.datetime.now(tz=datetime.timezone.utc)
    snapshot = session.get_or_fetch_snapshot("http://example.org/")
    after = datetime.datetime.now(tz=datetime.timezone.utc)

    # retrieved_at is generated inside the call; we can only bound it.
    assert before <= snapshot.retrieved_at <= after

    expected_snapshot = models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=datetime.datetime(
            2022, 9, 20, 16, 42, 22, tzinfo=datetime.timezone.utc
        ),
        snapshot_url=(
            "https://web.archive.org/web/20220920164222id_/"  # SPN returns HTTPS URLs
            "http://example.org/"
        ),
        retrieved_at=snapshot.retrieved_at,
        retrieved_by=socket.getfqdn(),
        response_headers={},
        content=b"Example page content from Wayback Machine after Save Page Now",
    )
    assert snapshot == expected_snapshot

    # Three requests: "available" lookup, Save Page Now, then raw snapshot fetch.
    requests_made = [
        (request.method, request.url)
        for request in configured_requests_mocker.request_history
    ]
    assert requests_made == [
        (
            "GET",
            "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
        ),
        (
            "GET",
            "https://web.archive.org/save/http://example.org/",
        ),
        (
            "GET",
            "https://web.archive.org/web/20220920164222id_/"  # ditto
            "http://example.org/",
        ),
    ]
|
||||
|
||||
|
||||
def test_get__expired_cache__expired_wb(configured_requests_mocker, opdb_db: Db):
    """
    Tests getting a snapshot that is expired in the local cache, and expired in
    the Wayback Machine -> uses Save Page Now
    """
    # Seed the cache with a snapshot older than the freshness cutoff used by
    # the reused test below.
    stale_snapshot = models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=SNAPSHOT_DATE - datetime.timedelta(days=2),
        snapshot_url=None,
        retrieved_at=SNAPSHOT_DATE + datetime.timedelta(days=1),
        retrieved_by="localhost",
        response_headers={"Content-Length": "16"},
        content=b"Old page content",
    )
    opdb_db.add_web_page_snapshots([stale_snapshot])

    # Reuse the other test; web_cache.Session should simply ignore the outdated snapshot
    test_get__uncached__expired_wb(configured_requests_mocker, opdb_db)
|
||||
|
||||
|
||||
def test_get__expired_cache__no_wb(configured_requests_mocker, opdb_db: Db):
    """
    Tests getting a snapshot that is expired in the local cache, and absent from
    the Wayback Machine -> uses Save Page Now
    """
    # NOTE(review): despite the name, nothing is inserted in the local cache
    # here — the reused test below starts from an empty cache. Confirm whether
    # a stale cache entry should also be seeded.
    # Override the "available" endpoint so it reports no snapshot at all.
    configured_requests_mocker.register_uri(
        "GET",
        "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
        complete_qs=True,
        json={"url": "http://example.org/", "archived_snapshots": {}},
    )

    # Reuse the other test; web_cache.Session should treat the absence of a page
    # exactly the same way as an expired one.
    test_get__uncached__expired_wb(configured_requests_mocker, opdb_db)
|
|
@ -5,16 +5,20 @@ build-backend = "setuptools.build_meta"
|
|||
[project]
|
||||
name = "opdb"
|
||||
version = "0.0.1"
|
||||
requires-python = ">=3.9"
|
||||
dependencies = [
|
||||
"luigi == 3",
|
||||
"psycopg == 3",
|
||||
"requests == 2",
|
||||
"luigi == 3.*",
|
||||
"psycopg == 3.*",
|
||||
"requests == 2.*",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
testing = [
|
||||
"pytest",
|
||||
"pytest-postgresql",
|
||||
"requests-mock",
|
||||
"types-requests",
|
||||
"types-setuptools",
|
||||
]
|
||||
|
||||
[tool.isort]
|
||||
|
@ -23,15 +27,22 @@ profile = "black"
|
|||
[tool.mypy]
|
||||
python_version = "3.9"
|
||||
|
||||
[[tool.mypy.overrides]]
|
||||
module = [
|
||||
"requests_mock",
|
||||
]
|
||||
ignore_missing_imports = true
|
||||
|
||||
[tool.pylint.format]
|
||||
max-line-length = "88"
|
||||
py-version = "3.9"
|
||||
disable = [
|
||||
# too annoying:
|
||||
"no-else-return",
|
||||
"too-many-instance-attributes",
|
||||
"invalid-name",
|
||||
"fixme",
|
||||
"invalid-name",
|
||||
"no-else-return",
|
||||
"too-few-public-methods",
|
||||
"too-many-instance-attributes",
|
||||
# mypy does it better:
|
||||
"no-member",
|
||||
"import-error",
|
||||
|
|
Loading…
Reference in New Issue