Compare commits


13 Commits
ci ... main

Author SHA1 Message Date
Val Lorentz abd9edf336 web_cache: Fix SPN error handling
2022-09-25 11:22:41 +02:00
Val Lorentz 1eb169ee6c Add optional support for the authenticated SPN API
It has much higher rate limits.
2022-09-25 11:18:13 +02:00
Val Lorentz 1db60954d6 Save pages returned by Save Page Now
This is especially useful as pages freshly archived by SPN are not
immediately shown as available by the Wayback Machine API, so we
may otherwise send a pointless SPN request right afterward.
2022-09-24 22:43:34 +02:00
Val Lorentz 8139bf5410 Fix crash in Save Page Now, when run without mocks 2022-09-24 22:37:22 +02:00
Val Lorentz 9782ec22ec Add a CLI to initialize the database. 2022-09-24 22:30:25 +02:00
Val Lorentz 6f022e5f05 web_cache: Initialize module
With support for fetching from the IA and caching in the local DB.
2022-09-20 21:41:07 +02:00
Val Lorentz 5f3e9d6225 db: Add tests for get_last_web_page_snapshot. 2022-09-20 21:31:30 +02:00
Val Lorentz 13ca40eaf7 orm: Refactor INDEX generation
2022-09-20 21:31:10 +02:00
Val Lorentz 826c6f73f1 orm: Add SELECT generation
2022-09-20 21:28:34 +02:00
Val Lorentz 8abf869c13 Add column 'retrieved_by' to web_page_snapshot
2022-09-18 19:41:26 +02:00
Val Lorentz 95fcb043e8 Initialize with a simple ORM and database to store webpage snapshots
2022-09-17 23:58:57 +02:00
Val Lorentz 671112566f add more linting, give up on py3.7
2022-09-17 23:40:59 +02:00
val efe3aaccd4 Initialize boilerplate (#1)
Co-authored-by: Val Lorentz <progval+opdb@progval.net>
Reviewed-on: #1
2022-09-14 16:33:41 +00:00
16 changed files with 1340 additions and 0 deletions

.woodpecker.yml Normal file

@@ -0,0 +1,34 @@
pipeline:
  flake8:
    group: lint
    image: python:3.9
    commands:
      - pip3 install flake8
      - flake8 opdb/

  pylint:
    group: lint
    image: cytopia/pylint
    commands:
      - pylint opdb/

  black:
    group: lint
    image: pyfound/black:22.8.0
    commands:
      - black --check opdb/

  test-py3.9:
    group: test
    image: python:3.9
    commands: &test_commands
      - apt-get update
      - apt-get install -y postgresql
      - pip3 install mypy .[testing]
      - make mypy
      - adduser pytest
      # pytest-postgresql runs pg_ctl, which refuses to run as root
      - su pytest -c 'make pytest'

  test-py3.10:
    group: test
    image: python:3.10
    commands: *test_commands
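(The &test_commands / *test_commands pair is a standard YAML anchor and alias, so test-py3.10 runs exactly the same commands as test-py3.9, just on a different Python image.)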

Makefile Normal file

@@ -0,0 +1,31 @@
all: test

test: lint pytest

lint: black flake8 pylint isort mypy

black-check:
	black --check opdb/

black:
	black opdb/

flake8: black
	flake8 opdb/

pylint:
	pylint opdb/

isort-check:
	isort --check-only opdb/

isort:
	isort opdb/

mypy:
	mypy --show-error-codes opdb

pytest:
	pytest --doctest-modules

.PHONY: black black-check isort isort-check mypy pytest test

opdb/__init__.py Normal file (empty)

opdb/__main__.py Normal file

@@ -0,0 +1,51 @@
# This file is part of the Open Parts Database software
# Copyright (C) 2022 Valentin Lorentz
#
# This program is free software: you can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License version 3, as published by the
# Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
"""
CLI entrypoint.
"""
import sys
import typing
def error(msg: str) -> typing.NoReturn:
"""Prints the message on stderr and exits with code 1."""
print(msg, file=sys.stderr)
sys.exit(1)
def main() -> None:
"""CLI entrypoint"""
try:
(executable, subcommand, *args) = sys.argv
except ValueError:
error(f"Syntax: {sys.argv[0]} <subcommand> [<arg1> [<arg2> [...]]]")
if subcommand == "initdb":
from opdb.db import Db # pylint: disable=import-outside-toplevel
try:
(dsn,) = args
except ValueError:
error(f"Syntax: {executable} initdb <libpq DSN>")
with Db.open(dsn) as db:
db.init()
else:
error(f"Unknown subcommand: {subcommand}")
if __name__ == "__main__":
main()
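For reference, initializing a database through this entrypoint would look like the following; the DSN value is illustrative, not something defined by the repository:

python -m opdb initdb "host=localhost dbname=opdb"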

opdb/conftest.py Normal file

@@ -0,0 +1,40 @@
# This file is part of the Open Parts Database software
# Copyright (C) 2022 Valentin Lorentz
#
# This program is free software: you can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License version 3, as published by the
# Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
"""
pytest fixtures
"""
import pytest
from opdb.db import Db
def iter_subclasses(cls):
"""
Recursively yields all subclasses of a class.
"""
yield cls
for subcls in cls.__subclasses__():
yield from iter_subclasses(subcls)
@pytest.fixture
def opdb_db(postgresql) -> Db:
"""
pytest fixture which yields an empty initialized OPDB database.
"""
db = Db(postgresql)
db.init()
return db

opdb/db/__init__.py Normal file

@@ -0,0 +1,19 @@
# This file is part of the Open Parts Database software
# Copyright (C) 2022 Valentin Lorentz
#
# This program is free software: you can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License version 3, as published by the
# Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
"""
Database management
"""
from .db import Db # noqa

opdb/db/db.py Normal file

@@ -0,0 +1,74 @@
# This file is part of the Open Parts Database software
# Copyright (C) 2022 Valentin Lorentz
#
# This program is free software: you can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License version 3, as published by the
# Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
"""
Abstraction over the postgresql database used by OPDB
"""
from __future__ import annotations
import contextlib
import typing
import psycopg
from . import models
class Db:
"""
Abstraction over the postgresql database used by OPDB
"""
def __init__(self, conn: psycopg.Connection):
self.conn = conn
@classmethod
@contextlib.contextmanager
def open(cls, dsn: str) -> typing.Iterator[Db]:
"""
Context manager, which yields a :class:`Db` object given a libpq connection
string (DSN)
"""
with psycopg.connect(dsn) as conn:
yield Db(conn)
def init(self) -> None:
"""
Initializes the schema for the connected database.
"""
with self.conn.cursor() as cur:
for name in dir(models):
cls = getattr(models, name)
if hasattr(cls, "TABLE"):
cur.execute(cls.db_schema())
def get_last_web_page_snapshot(
self, url: str
) -> typing.Optional[models.WebPageSnapshot]:
"""
Returns the last snapshot of the given IRI.
"""
snapshots = models.WebPageSnapshot.select(
self.conn, "WHERE url=%s ORDER BY snapshot_date DESC LIMIT 1", (url,)
)
return next(snapshots, None)
def add_web_page_snapshots(
self, snapshots: typing.Iterable[models.WebPageSnapshot]
) -> None:
"""
Stores new snapshots of web pages to the database.
"""
models.WebPageSnapshot.copy_to_db(self.conn, snapshots)
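A minimal sketch of how application code would use this class, assuming a reachable postgresql instance (the DSN is illustrative):

from opdb.db import Db

with Db.open("host=localhost dbname=opdb") as db:
    db.init()  # safe to re-run: the generated schema uses CREATE ... IF NOT EXISTS
    snapshot = db.get_last_web_page_snapshot("http://example.org/")
    if snapshot is not None:
        print(snapshot.snapshot_date, len(snapshot.content))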

opdb/db/db_test.py Normal file

@@ -0,0 +1,70 @@
# This file is part of the Open Parts Database software
# Copyright (C) 2022 Valentin Lorentz
#
# This program is free software: you can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License version 3, as published by the
# Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
"""
Tests basic insertion and retrieval functions
"""
import datetime
import random
from opdb.db import Db, models
def test_missing_web_page_snapshot(opdb_db: Db):
"""Tests retrieving a missing web page returns None."""
assert opdb_db.get_last_web_page_snapshot("http://nonexistent.org") is None
def test_add_web_page_snapshot(opdb_db: Db):
"""Tests adding a web page and that it can be retrieved."""
date = datetime.datetime.now(tz=datetime.timezone.utc)
snapshots = [
models.WebPageSnapshot(
url=f"http://example.org/{i}",
snapshot_date=datetime.datetime.now(tz=datetime.timezone.utc),
snapshot_url=None,
retrieved_at=date,
retrieved_by="localhost",
response_headers={"Content-Length": "7"},
content=f"snapshot {i}".encode(),
)
for i in range(100)
]
opdb_db.add_web_page_snapshots(snapshots)
assert opdb_db.get_last_web_page_snapshot("http://example.org/10") == snapshots[10]
def test_get_last_web_page_snapshot(opdb_db: Db):
"""Tests adding a web page and that it can be retrieved."""
date = datetime.datetime.now(tz=datetime.timezone.utc)
snapshots = [
models.WebPageSnapshot(
url="http://example.org/",
snapshot_date=datetime.datetime.now(tz=datetime.timezone.utc),
snapshot_url=None,
retrieved_at=date,
retrieved_by="localhost",
response_headers={"Content-Length": "7"},
content=f"snapshot {i}".encode(),
)
for i in range(100)
]
last_snapshot = snapshots[-1]
random.shuffle(snapshots)
opdb_db.add_web_page_snapshots(snapshots)
assert opdb_db.get_last_web_page_snapshot("http://example.org/") == last_snapshot

opdb/db/models.py Normal file

@@ -0,0 +1,55 @@
# This file is part of the Open Parts Database software
# Copyright (C) 2022 Valentin Lorentz
#
# This program is free software: you can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License version 3, as published by the
# Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
"""
Classes representing objects in the postgresql database
"""
import dataclasses
import datetime
import typing
from .orm import BaseModel as _BaseModel
@dataclasses.dataclass(frozen=True)
class WebPageSnapshot(_BaseModel):
"""Local cache of a live webpage"""
TABLE = "web_page_snapshot"
PK = ("url", "snapshot_date")
url: str
"""IRI of the page"""
snapshot_date: datetime.datetime
"""Moment the snapshot was taken from the live website"""
snapshot_url: typing.Optional[str]
"""IRI where the page was downloaded from (:const:`None` unless the snapshot
was downloaded from a proxy)."""
retrieved_at: datetime.datetime
"""Moment the snapshot was downloaded by opdb and inserted in the DB (differs from
:attr:`snapshot_date` if the snapshot was taken by a proxy)."""
retrieved_by: str
"""Unique string identifying the worker that downloaded the snapshot."""
# TODO: define its format
response_headers: dict[str, str]
"""Response headers of the webpage"""
content: bytes
"""Content of the webpage."""

opdb/db/models_test.py Normal file

@@ -0,0 +1,51 @@
# This file is part of the Open Parts Database software
# Copyright (C) 2022 Valentin Lorentz
#
# This program is free software: you can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License version 3, as published by the
# Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
"""
Tests the ORM
"""
import datetime
import pytest
from opdb.db import models
def test_naive_datetime():
"""Tests using a naive datetime as attribute of a model raises an error."""
tz_date = datetime.datetime.now(tz=datetime.timezone.utc)
naive_date = datetime.datetime.now()
with pytest.raises(TypeError, match="timezone-aware datetime"):
models.WebPageSnapshot(
url="http://example.org/",
snapshot_date=tz_date,
snapshot_url=None,
retrieved_at=naive_date,
retrieved_by="localhost",
response_headers={"Content-Length": b"7"},
content=b"foo bar",
)
with pytest.raises(TypeError, match="timezone-aware datetime"):
models.WebPageSnapshot(
url="http://example.org/",
snapshot_date=naive_date,
snapshot_url=None,
retrieved_at=tz_date,
retrieved_by="localhost",
response_headers={"Content-Length": b"7"},
content=b"foo bar",
)

opdb/db/orm.py Normal file

@@ -0,0 +1,172 @@
# This file is part of the Open Parts Database software
# Copyright (C) 2022 Valentin Lorentz
#
# This program is free software: you can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License version 3, as published by the
# Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
"""
A minimalist ORM
Features:
* generates postgresql schemas
* provides easy access to postgresql's COPY TO (even for jsonb columns)
* checks :cls:`datetime.datetime` objects are timezone-aware.
"""
import dataclasses
import datetime
import json
import textwrap
import typing
import psycopg
_TSelf = typing.TypeVar("_TSelf", bound="BaseModel")
_TYPE_TO_SQL = {
datetime.datetime: "timestamptz",
str: "text",
bytes: "bytea",
dict: "jsonb",
}
def _type_to_sql(type_: type, *, nullable=False) -> str:
origin = getattr(type_, "__origin__", None)
if origin is typing.Union:
variants = type_.__args__ # type: ignore[attr-defined]
non_none_variants = [
variant for variant in variants if not issubclass(variant, type(None))
]
if len(variants) != 2:
raise TypeError(
f"Unsupported type: {type_} (expected exactly 2 variants, "
f"got {variants!r})"
)
if len(non_none_variants) != 1:
raise TypeError(
f"Unsupported type: {type_} (expected exactly 1 non-None variant, "
f"got {non_none_variants!r})"
)
(inner_type,) = non_none_variants
# type is Optional[inner_type]
return _type_to_sql(inner_type, nullable=True)
elif origin is not None:
# another generic type; simply ignore its __args__
return _type_to_sql(origin)
else:
sql_type = _TYPE_TO_SQL[type_]
if not nullable:
sql_type += " NOT NULL"
return sql_type
class BaseModel:
"""
Base class for all model classes, which provides class methods to generate
DB schema and efficiently insert instances.
"""
TABLE: str
"""Name of the SQL table."""
PK: tuple[str, ...]
"""Primary key of the SQL table."""
__DATETIME_FIELD_NAMES: list[str]
__JSON_FIELD_NAMES: list[str]
def __init_subclass__(cls, *args, **kwargs):
"""
Precomputes ``__DATETIME_FIELD_NAMES`` and ``__JSON_FIELD_NAMES`` on
class initialization, so ``__post_init__`` and ``copy_to_db`` do not need
to run the whole introspection machinery every time.
"""
super().__init_subclass__(*args, **kwargs)
cls.__DATETIME_FIELD_NAMES = []
cls.__JSON_FIELD_NAMES = []
for (field_name, field_type) in cls.__annotations__.items():
if isinstance(field_type, type):
origin = getattr(field_type, "__origin__", None)
args = getattr(field_type, "__args__", None)
if issubclass(field_type, datetime.datetime) or (
origin is typing.Union and datetime.datetime in args
):
cls.__DATETIME_FIELD_NAMES.append(field_name)
if issubclass(field_type, dict) or (
origin is not None and issubclass(origin, dict)
):
cls.__JSON_FIELD_NAMES.append(field_name)
return cls
def __post_init__(self):
"""
Errors if any of the fields is a naive datetime.
"""
for field_name in self.__DATETIME_FIELD_NAMES:
if getattr(self, field_name).tzinfo is None:
raise TypeError(f"{field_name} must be a timezone-aware datetime.")
@classmethod
def copy_to_db(
cls: type[_TSelf], conn: psycopg.Connection, objects: typing.Iterable[_TSelf]
) -> None:
"""
Takes a postgresql connection and an iterable of instances,
and inserts all the instances efficiently in postgresql.
"""
cols = [field.name for field in dataclasses.fields(cls)]
with conn.cursor() as cur:
with cur.copy(f"COPY {cls.TABLE} ({', '.join(cols)}) FROM STDIN") as copy:
for obj in objects:
row = tuple(
json.dumps(getattr(obj, col))
if col in cls.__JSON_FIELD_NAMES
else getattr(obj, col)
for col in cols
)
copy.write_row(row)
@classmethod
def select(
cls: type[_TSelf], conn: psycopg.Connection, clauses: str, params: tuple
) -> typing.Iterator[_TSelf]:
"""
Selects objects from the corresponding table and yields instances of this class.
"""
with conn.cursor(row_factory=psycopg.rows.class_row(cls)) as cur:
cur.execute(f"SELECT * FROM {cls.TABLE} {clauses}", params)
yield from cur
@classmethod
def db_schema(cls) -> str:
"""
Returns SQL code suitable to initialize a table to store instances
of this class.
"""
cols = ",\n ".join(
f"{field.name} {_type_to_sql(field.type)}"
for field in dataclasses.fields(cls)
)
return textwrap.dedent(
f"""\
CREATE TABLE IF NOT EXISTS {cls.TABLE} (
{cols}
);
CREATE UNIQUE INDEX IF NOT EXISTS {cls.TABLE}_pk ON {cls.TABLE} (
{', '.join(cls.PK)}
);
"""
)
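To make the type mapping concrete, here is what _type_to_sql returns for the annotations used by WebPageSnapshot. This is a sketch that reaches into a private helper purely for illustration:

import typing

from opdb.db.orm import _type_to_sql

_type_to_sql(str)                   # 'text NOT NULL'
_type_to_sql(bytes)                 # 'bytea NOT NULL'
_type_to_sql(typing.Optional[str])  # 'text' (Optional drops the NOT NULL)
_type_to_sql(dict[str, str])        # 'jsonb NOT NULL' (generic args are ignored)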

opdb/db/orm_test.py Normal file

@@ -0,0 +1,41 @@
# This file is part of the Open Parts Database software
# Copyright (C) 2022 Valentin Lorentz
#
# This program is free software: you can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License version 3, as published by the
# Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
"""
Tests the ORM
"""
import textwrap
from opdb.db import models
def test_db_schema():
"""Tests generation of the DB schema for WebPageSnapshot."""
assert models.WebPageSnapshot.db_schema() == textwrap.dedent(
"""\
CREATE TABLE IF NOT EXISTS web_page_snapshot (
url text NOT NULL,
snapshot_date timestamptz NOT NULL,
snapshot_url text,
retrieved_at timestamptz NOT NULL,
retrieved_by text NOT NULL,
response_headers jsonb NOT NULL,
content bytea NOT NULL
);
CREATE UNIQUE INDEX IF NOT EXISTS web_page_snapshot_pk ON web_page_snapshot (
url, snapshot_date
);
"""
)

opdb/web_cache.py Normal file

@@ -0,0 +1,245 @@
# This file is part of the Open Parts Database software
# Copyright (C) 2022 Valentin Lorentz
#
# This program is free software: you can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License version 3, as published by the
# Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
"""
Management of the cache of external pages.
This package fetches web pages, archives them in the Internet Archive for future
citation, and caches them in the local database for quick access by other workers.
"""
import datetime
import re
import socket
import time
import typing
import pkg_resources
import requests
from opdb.db import Db, models
_OPDB_VERSION = pkg_resources.require("opdb")[0].version
USER_AGENT = (
f"OPDB/{_OPDB_VERSION} (Open Parts Database cacher; +https://git.tf/opdb/opdb)"
)
_wayback_url_re = re.compile(
r"^https?://web\.archive\.org/web/(?P<timestamp>[0-9]{14})/(?P<origin_url>.+)$"
)
def _datetime_from_ia_timestamp(ia_timestamp: str) -> datetime.datetime:
"""
>>> _datetime_from_ia_timestamp("20220919233014")
datetime.datetime(2022, 9, 19, 23, 30, 14, tzinfo=datetime.timezone.utc)
"""
dt = datetime.datetime.strptime(ia_timestamp, "%Y%m%d%H%M%S")
# Assume it's UTC (neither the Wayback API nor the documentation mention
# timezones)
return dt.replace(tzinfo=datetime.timezone.utc)
class Session:
"""
Wrapper for :class:`requests.Session`, which tries to use pages cached locally in
the postgresql database, and falls back to downloading; making sure they are
archived in the Internet Archive.
"""
def __init__(
self, db: Db, min_snapshot_date: datetime.datetime, ias3_auth: str = None
):
self.min_snapshot_date = min_snapshot_date
self._db = db
self._session = requests.Session()
self._session.headers["User-Agent"] = USER_AGENT
self._ias3_auth = ias3_auth
def _fetch_newest_wayback_snapshot(
self, url: str
) -> typing.Optional[models.WebPageSnapshot]:
"""
If the URL is already archived in the Internet Archive (and newer than
configured with ``min_snapshot_date``), retrieves the latest snapshot available
via the Wayback Machine and returns it.
"""
# API documentation: https://archive.org/help/wayback_api.php
response = self._session.get(
"https://archive.org/wayback/available", params={"url": url}
)
response.raise_for_status() # TODO: retry
newest_ia_snapshot = (
response.json().get("archived_snapshots", {}).get("closest", {})
)
if not newest_ia_snapshot:
return None
ia_timestamp = newest_ia_snapshot["timestamp"]
snapshot_date = _datetime_from_ia_timestamp(ia_timestamp)
if snapshot_date < self.min_snapshot_date:
return None
wayback_url = newest_ia_snapshot["url"]
m = _wayback_url_re.match(wayback_url)
assert m, f"Unexpected Wayback URL format: {wayback_url}"
assert ia_timestamp == m.group(
"timestamp"
), "Timestamp unexpectedly missing from snapshot URL: {wayback_url}"
return self._fetch_wayback_snapshot(url, wayback_url)
def _fetch_wayback_snapshot(
self, url: str, wayback_url: str
) -> models.WebPageSnapshot:
# Add "id_" after the timestamp in the Wayback URL; it allows fetching the
# original page without the navigation header added by the Wayback Machine.
# Documented at https://archive.org/post/1044859/
m = _wayback_url_re.match(wayback_url)
assert m, f"Unexpected Wayback URL format: {wayback_url}"
ia_timestamp = m.group("timestamp")
snapshot_url = wayback_url.replace(ia_timestamp, ia_timestamp + "id_", 1)
response = self._session.get(snapshot_url)
response.raise_for_status() # TODO: retry
return models.WebPageSnapshot(
url=url,
snapshot_date=_datetime_from_ia_timestamp(ia_timestamp),
snapshot_url=snapshot_url,
retrieved_at=datetime.datetime.now(tz=datetime.timezone.utc),
retrieved_by=socket.getfqdn(),
response_headers=dict(response.headers),
content=response.content,
)
def _save_page_now(self, url: str) -> typing.Optional[models.WebPageSnapshot]:
if self._ias3_auth is None:
return self._anonymous_save_page_now(url)
else:
return self._authenticated_save_page_now(url, self._ias3_auth)
def _anonymous_save_page_now(
self, url: str
) -> typing.Optional[models.WebPageSnapshot]:
while True:
try:
response = self._session.get(
f"https://web.archive.org/save/{url}",
allow_redirects=False,
)
response.raise_for_status()
except requests.exceptions.HTTPError as e:
if e.response.status_code == 429:
print(e)
print("Sleeping...")
time.sleep(10)
continue
elif e.response.status_code == 520:
# "Job failed". We will try again in the next workflow run.
return None
else:
raise
else:
wayback_url = response.headers["Location"]
return self._fetch_wayback_snapshot(url, wayback_url)
def _authenticated_save_page_now(
self, url: str, ias3_auth: str
) -> typing.Optional[models.WebPageSnapshot]:
for _ in range(3):
response = self._session.post(
"https://web.archive.org/save/",
allow_redirects=False,
data={"url": url},
headers={
"Accept": "application/json",
"Authorization": f"LOW {ias3_auth}",
},
)
response.raise_for_status()
if response.json().get("status_ext") == "error:too-many-daily-captures":
# typically happens when a page repeatedly fails so we
# (unsuccessfully) tried to capture it too many times
return None
job_id = response.json()["job_id"]
status = "pending"
while status == "pending":
time.sleep(5)
response = self._session.get(
f"https://web.archive.org/save/status/{job_id}"
)
response.raise_for_status()
status = response.json()["status"]
if status == "success":
break
if response.json()["status"] == "error":
print(response.json()["message"])
time.sleep(10)
continue # retry
assert False, response.json()
else:
print("Too many failures; giving up.")
return None
ia_timestamp = response.json()["timestamp"]
snapshot_date = _datetime_from_ia_timestamp(ia_timestamp)
assert snapshot_date >= self.min_snapshot_date, (
snapshot_date.isoformat(),
self.min_snapshot_date.isoformat(),
)
wayback_url = f"https://web.archive.org/web/{ia_timestamp}/{url}"
return self._fetch_wayback_snapshot(url, wayback_url)
def _get_cached_snapshot(self, url: str) -> typing.Optional[models.WebPageSnapshot]:
snapshot = self._db.get_last_web_page_snapshot(url)
if snapshot is None or snapshot.snapshot_date < self.min_snapshot_date:
return None
return snapshot
def get_or_fetch_snapshot(
self, url: str
) -> typing.Optional[models.WebPageSnapshot]:
"""
Fetches the given URL from the local cache or from the Wayback Machine.
Requests archival by the Internet Archive if the Wayback Machine does not
already have the page available.
"""
# First, try the local cache
snapshot = self._get_cached_snapshot(url)
if snapshot is not None:
return snapshot
# Then, try fetching from the Wayback Machine (and cache it locally)
snapshot = self._fetch_newest_wayback_snapshot(url)
if snapshot is not None:
self._db.add_web_page_snapshots([snapshot])
return snapshot
# If the Internet Archive does not have it yet, trigger its Save Code Now,
# and query the Wayback Machine again
snapshot = self._save_page_now(url)
if snapshot is not None:
self._db.add_web_page_snapshots([snapshot])
return snapshot
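A minimal sketch of how a worker would drive this session; the DSN and cutoff date are illustrative, and ias3_auth (if used) would be an Internet Archive S3 "accesskey:secret" pair:

import datetime

from opdb.db import Db
from opdb.web_cache import Session

with Db.open("host=localhost dbname=opdb") as db:
    session = Session(
        db,
        min_snapshot_date=datetime.datetime(
            2022, 9, 1, tzinfo=datetime.timezone.utc
        ),
    )
    snapshot = session.get_or_fetch_snapshot("http://example.org/")
    if snapshot is not None:
        print(snapshot.snapshot_url, len(snapshot.content))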

opdb/web_cache_test.py Normal file

@@ -0,0 +1,395 @@
# This file is part of the Open Parts Database software
# Copyright (C) 2022 Valentin Lorentz
#
# This program is free software: you can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License version 3, as published by the
# Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
# pylint: disable=redefined-outer-name
"""
Test generic web page retrieval and caching
"""
import datetime
import socket
import pytest
import requests_mock
from opdb.db import Db, models
from opdb.web_cache import Session
@pytest.fixture
def requests_mocker():
"""Fixture wrapper for :mod:`requests_mock`"""
with requests_mock.mock() as m:
yield m
SNAPSHOT_DATE = datetime.datetime(2022, 9, 20, 1, 49, 34, tzinfo=datetime.timezone.utc)
@pytest.fixture
def mock_time(mocker):
"""
Makes time.sleep return immediately, and returns a callable that returns the
total offset introduced by time.sleep calls.
"""
offset = 0
def sleep(seconds):
nonlocal offset
offset += seconds
mocker.patch("time.sleep", side_effect=sleep)
def get_offset():
return offset
return get_offset
@pytest.fixture
def configured_requests_mocker(requests_mocker):
"""Extension of :func:`requests_mocker` that registers a bunch of URLs of
the Wayback Machine API."""
requests_mocker.register_uri(
"GET",
"https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
complete_qs=True,
json={
"url": "http://example.org/",
"archived_snapshots": {
"closest": {
"status": "200",
"available": True,
"url": (
"http://web.archive.org/web/20220920014934/"
"http://john.smith@example.org/"
),
"timestamp": "20220920014934",
}
},
},
)
requests_mocker.register_uri(
"GET",
"http://web.archive.org/web/20220920014934id_/http://john.smith@example.org/",
complete_qs=True,
text="Example page content from Wayback Machine",
)
requests_mocker.register_uri(
"GET",
"https://web.archive.org/web/20220920164222id_/http://example.org/",
complete_qs=True,
text="Example page content from Wayback Machine after Save Page Now",
)
requests_mocker.register_uri(
"GET",
"https://web.archive.org/web/20220922000000id_/http://example.org/",
complete_qs=True,
text="Example page content from Wayback Machine after Save Page Now",
)
# Anonymous SPN:
requests_mocker.register_uri(
"GET",
"https://web.archive.org/save/http://example.org/",
complete_qs=True,
headers={
"location": "https://web.archive.org/web/20220922000000/http://example.org/"
},
text="""
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<title>Redirecting...</title>
<h1>Redirecting...</h1>
""",
)
# Authenticated SPN:
requests_mocker.register_uri(
"POST",
"https://web.archive.org/save/",
complete_qs=True,
text='{"url": "http://example.org/", "job_id": "spn2-abcde"}',
)
requests_mocker.register_uri(
"GET",
"https://web.archive.org/save/status/spn2-abcde",
[
dict(
text="""
{"job_id":"spn2-abcde","resources":["blah"],"status":"pending"}
"""
),
dict(
text="""
{"job_id":"spn2-abcde","resources":["blah"],"status":"pending"}
"""
),
dict(
text="""
{"job_id":"spn2-abcde","resources":["blah"],"status":"success",
"timestamp":"20220922000000"}
"""
),
],
complete_qs=True,
)
yield requests_mocker
def test_get__cached(configured_requests_mocker, opdb_db: Db):
"""
Tests getting a snapshot that is already in the local cache
"""
after_date = SNAPSHOT_DATE - datetime.timedelta(days=1)
s = Session(opdb_db, after_date)
retrieved_at = datetime.datetime(2022, 2, 1, 0, 0, 0, tzinfo=datetime.timezone.utc)
snapshot = models.WebPageSnapshot(
url="http://example.org/",
snapshot_date=datetime.datetime.now(tz=datetime.timezone.utc),
snapshot_url=None,
retrieved_at=retrieved_at,
retrieved_by="localhost",
response_headers={"Content-Length": "19"},
content=b"Example page content",
)
opdb_db.add_web_page_snapshots([snapshot])
assert s.get_or_fetch_snapshot("http://example.org/") == snapshot
assert [(r.method, r.url) for r in configured_requests_mocker.request_history] == []
def test_get__uncached__recent_wb(configured_requests_mocker, opdb_db: Db):
"""
Tests getting a snapshot that is not in the local cache, but is available in
the Wayback Machine
"""
after_date = SNAPSHOT_DATE - datetime.timedelta(days=1)
s = Session(opdb_db, after_date)
dt_before = datetime.datetime.now(tz=datetime.timezone.utc)
snapshot = s.get_or_fetch_snapshot("http://example.org/")
dt_after = datetime.datetime.now(tz=datetime.timezone.utc)
assert snapshot is not None
assert dt_before <= snapshot.retrieved_at <= dt_after
assert snapshot == models.WebPageSnapshot(
url="http://example.org/",
snapshot_date=datetime.datetime(
2022, 9, 20, 1, 49, 34, tzinfo=datetime.timezone.utc
),
snapshot_url=(
"http://web.archive.org/web/20220920014934id_/"
"http://john.smith@example.org/"
),
retrieved_at=snapshot.retrieved_at,
retrieved_by=socket.getfqdn(),
response_headers={},
content=b"Example page content from Wayback Machine",
)
assert snapshot == opdb_db.get_last_web_page_snapshot("http://example.org/")
assert [(r.method, r.url) for r in configured_requests_mocker.request_history] == [
(
"GET",
"https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
),
(
"GET",
"http://web.archive.org/web/20220920014934id_/"
"http://john.smith@example.org/",
),
]
def test_get__expired_cache__recent_wb(configured_requests_mocker, opdb_db: Db):
"""
Tests getting a snapshot that is expired in the local cache, but is available in
the Wayback Machine
"""
snapshoted_at = SNAPSHOT_DATE - datetime.timedelta(days=2)
retrieved_at = SNAPSHOT_DATE + datetime.timedelta(days=1)
snapshot = models.WebPageSnapshot(
url="http://example.org/",
snapshot_date=snapshoted_at,
snapshot_url=None,
retrieved_at=retrieved_at,
retrieved_by="localhost",
response_headers={"Content-Length": "16"},
content=b"Old page content",
)
opdb_db.add_web_page_snapshots([snapshot])
# Reuse the other test; web_cache.Session should simply ignore the outdated snapshot
test_get__uncached__recent_wb(configured_requests_mocker, opdb_db)
@pytest.fixture(
params=[
pytest.param(False, id="anonymous"),
pytest.param(True, id="authenticated"),
],
)
def authenticated(request) -> bool:
"""Parametrized by False/True"""
return request.param
def test_get__uncached__expired_wb(
configured_requests_mocker, opdb_db: Db, authenticated: bool, mock_time
):
"""
Tests getting a snapshot that is not in the local cache, and expired in
the Wayback Machine -> uses Save Page Now
"""
after_date = SNAPSHOT_DATE + datetime.timedelta(days=1)
if authenticated:
s = Session(opdb_db, after_date, ias3_auth="akey:skey")
else:
s = Session(opdb_db, after_date)
assert mock_time() == 0
dt_before = datetime.datetime.now(tz=datetime.timezone.utc)
snapshot = s.get_or_fetch_snapshot("http://example.org/")
dt_after = datetime.datetime.now(tz=datetime.timezone.utc)
if authenticated:
assert mock_time() == 15 # three time.sleep(5) calls
else:
assert mock_time() == 0 # no sleep() because blocking on SPN GET request
assert snapshot is not None
assert dt_before <= snapshot.retrieved_at <= dt_after
assert snapshot == models.WebPageSnapshot(
url="http://example.org/",
snapshot_date=datetime.datetime(
2022, 9, 22, 0, 0, 0, tzinfo=datetime.timezone.utc
),
snapshot_url=(
"https://web.archive.org/web/20220922000000id_/" # SPN returns HTTPS URLs
"http://example.org/"
),
retrieved_at=snapshot.retrieved_at,
retrieved_by=socket.getfqdn(),
response_headers={},
content=b"Example page content from Wayback Machine after Save Page Now",
)
assert snapshot == opdb_db.get_last_web_page_snapshot("http://example.org/")
if authenticated:
assert [
(r.method, r.url, r.body)
for r in configured_requests_mocker.request_history
] == [
(
"GET",
"https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
None,
),
(
"POST",
"https://web.archive.org/save/",
"url=http%3A%2F%2Fexample.org%2F",
),
("GET", "https://web.archive.org/save/status/spn2-abcde", None),
("GET", "https://web.archive.org/save/status/spn2-abcde", None),
("GET", "https://web.archive.org/save/status/spn2-abcde", None),
(
"GET",
"https://web.archive.org/web/20220922000000id_/" # ditto
"http://example.org/",
None,
),
]
else:
assert [
(r.method, r.url) for r in configured_requests_mocker.request_history
] == [
(
"GET",
"https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
),
(
"GET",
"https://web.archive.org/save/http://example.org/",
),
(
"GET",
"https://web.archive.org/web/20220922000000id_/" # ditto
"http://example.org/",
),
]
def test_get__expired_cache__expired_wb(
configured_requests_mocker, opdb_db: Db, authenticated: bool, mock_time
):
"""
Tests getting a snapshot that is expired in the local cache, and expired in
the Wayback Machine -> uses Save Page Now
"""
snapshoted_at = SNAPSHOT_DATE - datetime.timedelta(days=2)
retrieved_at = SNAPSHOT_DATE + datetime.timedelta(days=1)
snapshot = models.WebPageSnapshot(
url="http://example.org/",
snapshot_date=snapshoted_at,
snapshot_url=None,
retrieved_at=retrieved_at,
retrieved_by="localhost",
response_headers={"Content-Length": "16"},
content=b"Old page content",
)
opdb_db.add_web_page_snapshots([snapshot])
# Reuse the other test; web_cache.Session should simply ignore the outdated snapshot
test_get__uncached__expired_wb(
configured_requests_mocker, opdb_db, authenticated, mock_time
)
def test_get__expired_cache__no_wb(
configured_requests_mocker, opdb_db: Db, authenticated: bool, mock_time
):
"""
Tests getting a snapshot that is expired in the local cache, and absent from
the Wayback Machine -> uses Save Page Now
"""
configured_requests_mocker.register_uri(
"GET",
"https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
complete_qs=True,
json={"url": "http://example.org/", "archived_snapshots": {}},
)
# Reuse the other test; web_cache.Session should treat the absence of a page
# exactly the same way as an expired one.
test_get__uncached__expired_wb(
configured_requests_mocker, opdb_db, authenticated, mock_time
)

pyproject.toml Normal file

@@ -0,0 +1,59 @@
[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"

[project]
name = "opdb"
version = "0.0.1"
requires-python = ">=3.9"
dependencies = [
    "luigi == 3.*",
    "psycopg == 3.*",
    "requests == 2.*",
]

[project.optional-dependencies]
testing = [
    "pytest",
    "pytest-mock",
    "pytest-postgresql",
    "requests-mock",
    "types-requests",
    "types-setuptools",
]

[tool.isort]
profile = "black"

[tool.mypy]
python_version = "3.9"

[[tool.mypy.overrides]]
module = [
    "requests_mock",
]
ignore_missing_imports = true

[tool.pylint.format]
max-line-length = "88"
py-version = "3.9"
disable = [
    # too annoying:
    "fixme",
    "invalid-name",
    "no-else-return",
    "no-else-continue",
    "too-few-public-methods",
    "too-many-instance-attributes",
    # false positives:
    "unreachable",
    "assignment-from-no-return",
    # mypy does it better:
    "no-member",
    "import-error",
    # flake8 does it already:
    "line-too-long",
]

[tool.pytest.ini_options]
python_files = "*_test.py"
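To reproduce the CI checks locally, the commands in .woodpecker.yml above boil down to installing the testing extras and running the Makefile targets:

pip3 install mypy .[testing]
make mypy
make pytest  # CI runs this as an unprivileged user, because pg_ctl refuses to run as root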

setup.cfg Normal file

@@ -0,0 +1,3 @@
[flake8]
max-line-length = 88
extend-ignore = E203