Compare commits
13 Commits
Author | SHA1 | Date | |
---|---|---|---|
Val Lorentz | abd9edf336 | ||
Val Lorentz | 1eb169ee6c | ||
Val Lorentz | 1db60954d6 | ||
Val Lorentz | 8139bf5410 | ||
Val Lorentz | 9782ec22ec | ||
Val Lorentz | 6f022e5f05 | ||
Val Lorentz | 5f3e9d6225 | ||
Val Lorentz | 13ca40eaf7 | ||
Val Lorentz | 826c6f73f1 | ||
Val Lorentz | 8abf869c13 | ||
Val Lorentz | 95fcb043e8 | ||
Val Lorentz | 671112566f | ||
val | efe3aaccd4 |
|
@ -0,0 +1,34 @@
|
|||
# Woodpecker CI pipeline. Steps sharing a group run together; the "lint"
# steps run before the "test" steps.
pipeline:
  flake8:
    group: lint
    image: python:3.9
    commands:
      - pip3 install flake8
      - flake8 opdb/
  pylint:
    group: lint
    image: cytopia/pylint
    commands:
      - pylint opdb/

  black:
    group: lint
    image: pyfound/black:22.8.0
    commands:
      - black --check opdb/

  test-py3.9:
    group: test
    image: python:3.9
    # The anchor (&test_commands) lets test-py3.10 reuse this exact command
    # list without duplicating it.
    commands: &test_commands
      - apt-get update
      - apt-get install -y postgresql
      - pip3 install mypy .[testing]
      - make mypy
      - adduser pytest
      # pytest-postgresql runs pg_ctl, which refuses to run as root
      - su pytest -c 'make pytest'
  test-py3.10:
    group: test
    image: python:3.10
    commands: *test_commands
|
|
@ -0,0 +1,31 @@
|
|||
all: test

test: lint pytest

# Note: "lint" runs the reformatting "black"/"isort" targets (not the -check
# variants), so it fixes formatting in place before the style checkers run.
lint: black flake8 pylint isort mypy

black-check:
	black --check opdb/

black:
	black opdb/

# flake8 depends on black so the code is reformatted before style checking.
flake8: black
	flake8 opdb/

pylint:
	pylint opdb/

isort-check:
	isort --check-only opdb/

isort:
	isort opdb/

mypy:
	mypy --show-error-codes opdb

pytest:
	pytest --doctest-modules

# Every target here is a command alias, not a file; mark them all phony so a
# file named e.g. "pylint" or "lint" cannot shadow the target.
# (Previously "all", "lint", "flake8" and "pylint" were missing.)
.PHONY: all test lint black black-check flake8 pylint isort isort-check mypy pytest
|
|
@ -0,0 +1,51 @@
|
|||
# This file is part of the Open Parts Database software
|
||||
# Copyright (C) 2022 Valentin Lorentz
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under the
|
||||
# terms of the GNU Affero General Public License version 3, as published by the
|
||||
# Free Software Foundation.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along with
|
||||
# this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
"""
|
||||
CLI entrypoint.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import typing
|
||||
|
||||
|
||||
def error(msg: str) -> typing.NoReturn:
    """Report *msg* on standard error, then terminate the process with exit
    status 1."""
    sys.stderr.write(f"{msg}\n")
    sys.exit(1)
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entrypoint"""
    try:
        (executable, subcommand, *args) = sys.argv
    except ValueError:
        error(f"Syntax: {sys.argv[0]} <subcommand> [<arg1> [<arg2> [...]]]")

    # Guard clause: reject unknown subcommands up front (error() never returns).
    if subcommand != "initdb":
        error(f"Unknown subcommand: {subcommand}")

    # Deferred import: the database layer is only needed for this subcommand.
    from opdb.db import Db  # pylint: disable=import-outside-toplevel

    try:
        (dsn,) = args
    except ValueError:
        error(f"Syntax: {executable} initdb <libpq DSN>")

    with Db.open(dsn) as database:
        database.init()
|
||||
|
||||
|
||||
# Script entry point: only run when executed directly, not when imported.
if __name__ == "__main__":
    main()
|
|
@ -0,0 +1,40 @@
|
|||
# This file is part of the Open Parts Database software
|
||||
# Copyright (C) 2022 Valentin Lorentz
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under the
|
||||
# terms of the GNU Affero General Public License version 3, as published by the
|
||||
# Free Software Foundation.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along with
|
||||
# this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
"""
|
||||
pytest fixtures
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from opdb.db import Db
|
||||
|
||||
|
||||
def iter_subclasses(cls):
    """
    Recursively yields all subclasses of a class.

    The class itself is yielded first, then its direct and indirect
    subclasses, depth-first in definition order.
    """
    pending = [cls]
    while pending:
        current = pending.pop()
        yield current
        # Reversed so that popping from the end preserves definition order.
        pending.extend(reversed(current.__subclasses__()))
|
||||
|
||||
|
||||
@pytest.fixture
def opdb_db(postgresql) -> Db:
    """
    pytest fixture which returns an empty, schema-initialized OPDB database
    wrapping the connection provided by the ``postgresql`` fixture.
    """
    database = Db(postgresql)
    database.init()
    return database
|
|
@ -0,0 +1,19 @@
|
|||
# This file is part of the Open Parts Database software
|
||||
# Copyright (C) 2022 Valentin Lorentz
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under the
|
||||
# terms of the GNU Affero General Public License version 3, as published by the
|
||||
# Free Software Foundation.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along with
|
||||
# this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
"""
|
||||
Database management
|
||||
"""
|
||||
|
||||
from .db import Db # noqa
|
|
@ -0,0 +1,74 @@
|
|||
# This file is part of the Open Parts Database software
|
||||
# Copyright (C) 2022 Valentin Lorentz
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under the
|
||||
# terms of the GNU Affero General Public License version 3, as published by the
|
||||
# Free Software Foundation.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along with
|
||||
# this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
"""
|
||||
Abstraction over the postgresql database used by OPDB
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import typing
|
||||
|
||||
import psycopg
|
||||
|
||||
from . import models
|
||||
|
||||
|
||||
class Db:
    """
    Abstraction over the postgresql database used by OPDB
    """

    def __init__(self, conn: psycopg.Connection):
        self.conn = conn

    @classmethod
    @contextlib.contextmanager
    def open(cls, dsn: str) -> typing.Iterator[Db]:
        """
        Context manager, which yields a :class:`Db` object given a libpq connection
        string (DSN)
        """
        with psycopg.connect(dsn) as conn:
            # Fixed: instantiate via ``cls`` (was hard-coded ``Db``) so that a
            # subclass calling Subclass.open() gets an instance of itself.
            yield cls(conn)

    def init(self) -> None:
        """
        Initializes the schema for the connected database.
        """
        with self.conn.cursor() as cur:
            # Model classes are recognized by having a TABLE attribute set
            # (BaseModel only *annotates* TABLE, so it is skipped).
            for name in dir(models):
                cls = getattr(models, name)
                if hasattr(cls, "TABLE"):
                    cur.execute(cls.db_schema())

    def get_last_web_page_snapshot(
        self, url: str
    ) -> typing.Optional[models.WebPageSnapshot]:
        """
        Returns the last snapshot of the given IRI, or :const:`None` if the
        URL was never snapshotted.
        """
        snapshots = models.WebPageSnapshot.select(
            self.conn, "WHERE url=%s ORDER BY snapshot_date DESC LIMIT 1", (url,)
        )
        return next(snapshots, None)

    def add_web_page_snapshots(
        self, snapshots: typing.Iterable[models.WebPageSnapshot]
    ) -> None:
        """
        Stores new snapshots of web pages to the database.
        """
        models.WebPageSnapshot.copy_to_db(self.conn, snapshots)
|
|
@ -0,0 +1,70 @@
|
|||
# This file is part of the Open Parts Database software
|
||||
# Copyright (C) 2022 Valentin Lorentz
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under the
|
||||
# terms of the GNU Affero General Public License version 3, as published by the
|
||||
# Free Software Foundation.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along with
|
||||
# this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
"""
|
||||
Tests basic insertion and retrieval functions
|
||||
"""
|
||||
|
||||
import datetime
|
||||
import random
|
||||
|
||||
from opdb.db import Db, models
|
||||
|
||||
|
||||
def test_missing_web_page_snapshot(opdb_db: Db):
    """Tests retrieving a missing web page returns None."""
    snapshot = opdb_db.get_last_web_page_snapshot("http://nonexistent.org")
    assert snapshot is None
|
||||
|
||||
|
||||
def test_add_web_page_snapshot(opdb_db: Db):
    """Tests adding a web page and that it can be retrieved."""
    date = datetime.datetime.now(tz=datetime.timezone.utc)
    snapshots = []
    for i in range(100):
        snapshots.append(
            models.WebPageSnapshot(
                url=f"http://example.org/{i}",
                snapshot_date=datetime.datetime.now(tz=datetime.timezone.utc),
                snapshot_url=None,
                retrieved_at=date,
                retrieved_by="localhost",
                response_headers={"Content-Length": "7"},
                content=f"snapshot {i}".encode(),
            )
        )
    opdb_db.add_web_page_snapshots(snapshots)

    assert opdb_db.get_last_web_page_snapshot("http://example.org/10") == snapshots[10]
|
||||
|
||||
|
||||
def test_get_last_web_page_snapshot(opdb_db: Db):
    """Tests that, when several snapshots of the same URL exist, the one with
    the most recent snapshot_date is returned regardless of insertion order.

    (The docstring previously duplicated test_add_web_page_snapshot's and
    described the wrong behavior.)
    """
    date = datetime.datetime.now(tz=datetime.timezone.utc)

    snapshots = [
        models.WebPageSnapshot(
            url="http://example.org/",
            snapshot_date=datetime.datetime.now(tz=datetime.timezone.utc),
            snapshot_url=None,
            retrieved_at=date,
            retrieved_by="localhost",
            response_headers={"Content-Length": "7"},
            content=f"snapshot {i}".encode(),
        )
        for i in range(100)
    ]
    # Snapshots were created in chronological order, so the last list element
    # has the most recent snapshot_date.
    last_snapshot = snapshots[-1]
    # Shuffle before inserting so the test cannot pass by insertion order.
    random.shuffle(snapshots)
    opdb_db.add_web_page_snapshots(snapshots)

    assert opdb_db.get_last_web_page_snapshot("http://example.org/") == last_snapshot
|
|
@ -0,0 +1,55 @@
|
|||
# This file is part of the Open Parts Database software
|
||||
# Copyright (C) 2022 Valentin Lorentz
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under the
|
||||
# terms of the GNU Affero General Public License version 3, as published by the
|
||||
# Free Software Foundation.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along with
|
||||
# this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
"""
|
||||
Classes representing objects in the postgresql database
|
||||
"""
|
||||
|
||||
import dataclasses
|
||||
import datetime
|
||||
import typing
|
||||
|
||||
from .orm import BaseModel as _BaseModel
|
||||
|
||||
|
||||
@dataclasses.dataclass(frozen=True)
class WebPageSnapshot(_BaseModel):
    """Local cache of a live webpage"""

    # ORM metadata consumed by opdb.db.orm.BaseModel: SQL table name and the
    # columns forming the table's unique index.
    TABLE = "web_page_snapshot"
    PK = ("url", "snapshot_date")

    url: str
    """IRI of the page"""

    snapshot_date: datetime.datetime
    """Moment the snapshot was taken from the live website"""

    snapshot_url: typing.Optional[str]
    """IRI where the page was downloaded from (:const:`None` unless the snapshot
    was downloaded from a proxy)."""

    retrieved_at: datetime.datetime
    """Moment the snapshot was downloaded by opdb and inserted in the DB (differs from
    :attr:`snapshot_date` if the snapshot was taken by a proxy)."""

    retrieved_by: str
    """Unique string identifying the worker that downloaded the snapshot."""
    # TODO: define its format

    response_headers: dict[str, str]
    """Response headers of the webpage"""

    content: bytes
    """Content of the webpage."""
|
|
@ -0,0 +1,51 @@
|
|||
# This file is part of the Open Parts Database software
|
||||
# Copyright (C) 2022 Valentin Lorentz
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under the
|
||||
# terms of the GNU Affero General Public License version 3, as published by the
|
||||
# Free Software Foundation.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along with
|
||||
# this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
"""
|
||||
Tests the ORM
|
||||
"""
|
||||
|
||||
import datetime
|
||||
|
||||
import pytest
|
||||
|
||||
from opdb.db import models
|
||||
|
||||
|
||||
def test_naive_datetime():
    """Tests using a naive datetime as attribute of a model raises an error."""
    tz_date = datetime.datetime.now(tz=datetime.timezone.utc)
    naive_date = datetime.datetime.now()

    # Each datetime field in turn gets the naive value while the other one
    # stays timezone-aware; both constructions must be rejected.
    for (snapshot_date, retrieved_at) in [
        (tz_date, naive_date),
        (naive_date, tz_date),
    ]:
        with pytest.raises(TypeError, match="timezone-aware datetime"):
            models.WebPageSnapshot(
                url="http://example.org/",
                snapshot_date=snapshot_date,
                snapshot_url=None,
                retrieved_at=retrieved_at,
                retrieved_by="localhost",
                response_headers={"Content-Length": b"7"},
                content=b"foo bar",
            )
|
|
@ -0,0 +1,172 @@
|
|||
# This file is part of the Open Parts Database software
|
||||
# Copyright (C) 2022 Valentin Lorentz
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under the
|
||||
# terms of the GNU Affero General Public License version 3, as published by the
|
||||
# Free Software Foundation.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along with
|
||||
# this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
"""
|
||||
A minimalist ORM
|
||||
|
||||
Features:
|
||||
|
||||
* generates postgresql schemas
|
||||
* provides easy access to postgresql's COPY TO (even for jsonb columns)
|
||||
* checks :cls:`datetime.datetime` objects are timezone-aware.
|
||||
"""
|
||||
|
||||
import dataclasses
|
||||
import datetime
|
||||
import json
|
||||
import textwrap
|
||||
import typing
|
||||
|
||||
import psycopg
|
||||
|
||||
_TSelf = typing.TypeVar("_TSelf", bound="BaseModel")
|
||||
|
||||
_TYPE_TO_SQL = {
|
||||
datetime.datetime: "timestamptz",
|
||||
str: "text",
|
||||
bytes: "bytea",
|
||||
dict: "jsonb",
|
||||
}
|
||||
|
||||
|
||||
def _type_to_sql(type_: type, *, nullable=False) -> str:
|
||||
origin = getattr(type_, "__origin__", None)
|
||||
if origin is typing.Union:
|
||||
variants = type_.__args__ # type: ignore[attr-defined]
|
||||
non_none_variants = [
|
||||
variant for variant in variants if not issubclass(variant, type(None))
|
||||
]
|
||||
if len(variants) != 2:
|
||||
raise TypeError(
|
||||
f"Unsupported type: {type_} (expected exactly 2 variants, "
|
||||
f"got {variants!r})"
|
||||
)
|
||||
if len(non_none_variants) != 1:
|
||||
raise TypeError(
|
||||
f"Unsupported type: {type_} (expected exactly 1 non-None variant, "
|
||||
f"got {non_none_variants!r})"
|
||||
)
|
||||
|
||||
(inner_type,) = non_none_variants
|
||||
# type is Optional[inner_type]
|
||||
|
||||
return _type_to_sql(inner_type, nullable=True)
|
||||
elif origin is not None:
|
||||
# another generic type; simply ignore its __args__
|
||||
return _type_to_sql(origin)
|
||||
else:
|
||||
sql_type = _TYPE_TO_SQL[type_]
|
||||
if not nullable:
|
||||
sql_type += " NOT NULL"
|
||||
return sql_type
|
||||
|
||||
|
||||
class BaseModel:
    """
    Base class for all model classes, which provides class methods to generate
    DB schema and efficiently insert instances.

    Subclasses are expected to be dataclasses and to define :attr:`TABLE`
    and :attr:`PK`.
    """

    TABLE: str
    """Name of the SQL table."""

    PK: tuple[str, ...]
    """Primary key of the SQL table."""

    # Per-subclass caches filled by __init_subclass__. The double leading
    # underscore triggers name mangling, so all accesses below actually use
    # _BaseModel__DATETIME_FIELD_NAMES / _BaseModel__JSON_FIELD_NAMES.
    __DATETIME_FIELD_NAMES: list[str]
    __JSON_FIELD_NAMES: list[str]

    def __init_subclass__(cls, *args, **kwargs):
        """
        Precomputes ``__DATETIME_FIELD_NAMES`` and ``__JSON_FIELD_NAMES`` on
        class initialization, so ``__post_init__`` and ``copy_to_db`` do not need
        to run the whole introspection machinery every time.
        """
        super().__init_subclass__(*args, **kwargs)
        cls.__DATETIME_FIELD_NAMES = []
        cls.__JSON_FIELD_NAMES = []
        # cls.__annotations__ holds the subclass's own field annotations
        # (attribute lookup finds the subclass dict before BaseModel's).
        for (field_name, field_type) in cls.__annotations__.items():
            # NOTE(review): parameterized annotations such as
            # typing.Optional[datetime.datetime] are not instances of `type`,
            # so this guard skips them entirely -- meaning the
            # `origin is typing.Union` test below looks unreachable. Confirm
            # whether Optional datetime fields should also be registered
            # (and, if so, how __post_init__ should handle None values).
            if isinstance(field_type, type):
                origin = getattr(field_type, "__origin__", None)
                args = getattr(field_type, "__args__", None)
                if issubclass(field_type, datetime.datetime) or (
                    origin is typing.Union and datetime.datetime in args
                ):
                    cls.__DATETIME_FIELD_NAMES.append(field_name)
                if issubclass(field_type, dict) or (
                    origin is not None and issubclass(origin, dict)
                ):
                    cls.__JSON_FIELD_NAMES.append(field_name)
        # The return value of __init_subclass__ is ignored by Python;
        # returning cls is harmless.
        return cls

    def __post_init__(self):
        """
        Errors if any of the fields is a naive datetime.

        Runs automatically from the dataclass-generated __init__ of
        subclasses.

        :raises TypeError: if a datetime field has no tzinfo.
        """
        for field_name in self.__DATETIME_FIELD_NAMES:
            if getattr(self, field_name).tzinfo is None:
                raise TypeError(f"{field_name} must be a timezone-aware datetime.")

    @classmethod
    def copy_to_db(
        cls: type[_TSelf], conn: psycopg.Connection, objects: typing.Iterable[_TSelf]
    ) -> None:
        """
        Takes a postgresql connection and an iterable of instances,
        and inserts all the instances efficiently in postgresql.

        Uses postgresql's ``COPY ... FROM STDIN``; dict-typed fields are
        serialized to JSON strings first so they fit jsonb columns.
        """
        cols = [field.name for field in dataclasses.fields(cls)]
        with conn.cursor() as cur:
            with cur.copy(f"COPY {cls.TABLE} ({', '.join(cols)}) FROM STDIN") as copy:
                for obj in objects:
                    row = tuple(
                        json.dumps(getattr(obj, col))
                        if col in cls.__JSON_FIELD_NAMES
                        else getattr(obj, col)
                        for col in cols
                    )
                    copy.write_row(row)

    @classmethod
    def select(
        cls: type[_TSelf], conn: psycopg.Connection, clauses: str, params: tuple
    ) -> typing.Iterator[_TSelf]:
        """
        Selects objects from the corresponding table and yields instances of this class.

        :param clauses: SQL appended verbatim after ``SELECT * FROM <table>``
            (e.g. WHERE / ORDER BY / LIMIT), using ``%s`` placeholders.
        :param params: values substituted for the placeholders by psycopg.
        """
        with conn.cursor(row_factory=psycopg.rows.class_row(cls)) as cur:
            cur.execute(f"SELECT * FROM {cls.TABLE} {clauses}", params)
            yield from cur

    @classmethod
    def db_schema(cls) -> str:
        """
        Returns SQL code suitable to initialize a table to store instances
        of this class.
        """

        # The whitespace after "\n" in the join string must match the
        # indentation of the "{cols}" placeholder in the f-string below, so
        # that textwrap.dedent aligns every column definition consistently.
        cols = ",\n                ".join(
            f"{field.name} {_type_to_sql(field.type)}"
            for field in dataclasses.fields(cls)
        )
        return textwrap.dedent(
            f"""\
            CREATE TABLE IF NOT EXISTS {cls.TABLE} (
                {cols}
            );
            CREATE UNIQUE INDEX IF NOT EXISTS {cls.TABLE}_pk ON {cls.TABLE} (
                {', '.join(cls.PK)}
            );
            """
        )
|
|
@ -0,0 +1,41 @@
|
|||
# This file is part of the Open Parts Database software
|
||||
# Copyright (C) 2022 Valentin Lorentz
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under the
|
||||
# terms of the GNU Affero General Public License version 3, as published by the
|
||||
# Free Software Foundation.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along with
|
||||
# this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
"""
|
||||
Tests the ORM
|
||||
"""
|
||||
|
||||
import textwrap
|
||||
|
||||
from opdb.db import models
|
||||
|
||||
|
||||
def test_db_schema():
    """Tests generation of the DB schema for WebPageSnapshot."""
    expected = textwrap.dedent(
        """\
        CREATE TABLE IF NOT EXISTS web_page_snapshot (
            url text NOT NULL,
            snapshot_date timestamptz NOT NULL,
            snapshot_url text,
            retrieved_at timestamptz NOT NULL,
            retrieved_by text NOT NULL,
            response_headers jsonb NOT NULL,
            content bytea NOT NULL
        );
        CREATE UNIQUE INDEX IF NOT EXISTS web_page_snapshot_pk ON web_page_snapshot (
            url, snapshot_date
        );
        """
    )
    assert models.WebPageSnapshot.db_schema() == expected
|
|
@ -0,0 +1,245 @@
|
|||
# This file is part of the Open Parts Database software
|
||||
# Copyright (C) 2022 Valentin Lorentz
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under the
|
||||
# terms of the GNU Affero General Public License version 3, as published by the
|
||||
# Free Software Foundation.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along with
|
||||
# this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
"""
|
||||
Management of the cache of external pages.
|
||||
|
||||
This package fetches web pages, archives them in the Internet Archive for future
|
||||
citation, and caches them in the local database for quick access by other workers.
|
||||
"""
|
||||
|
||||
import datetime
|
||||
import re
|
||||
import socket
|
||||
import time
|
||||
import typing
|
||||
|
||||
import pkg_resources
|
||||
import requests
|
||||
|
||||
from opdb.db import Db, models
|
||||
|
||||
_OPDB_VERSION = pkg_resources.require("opdb")[0].version
|
||||
USER_AGENT = (
|
||||
f"OPDB/{_OPDB_VERSION} (Open Parts Database cacher; +https://git.tf/opdb/opdb)"
|
||||
)
|
||||
|
||||
_wayback_url_re = re.compile(
|
||||
r"^https?://web\.archive\.org/web/(?P<timestamp>[0-9]{14})/(?P<origin_url>.+)$"
|
||||
)
|
||||
|
||||
|
||||
def _datetime_from_ia_timestamp(ia_timestamp: str) -> datetime.datetime:
|
||||
"""
|
||||
>>> _datetime_from_ia_timestamp("20220919233014")
|
||||
datetime.datetime(2022, 9, 19, 23, 30, 14, tzinfo=datetime.timezone.utc)
|
||||
"""
|
||||
dt = datetime.datetime.strptime(ia_timestamp, "%Y%m%d%H%M%S")
|
||||
# Assume it's UTC (neither the Wayback API nor the documentation mention
|
||||
# timezones)
|
||||
return dt.replace(tzinfo=datetime.timezone.utc)
|
||||
|
||||
|
||||
class Session:
    """
    Wrapper for :class:`requests.Session`, which tries to use pages cached locally in
    the postgresql database, and falls back to downloading; making sure they are
    archived in the Internet Archive.
    """

    def __init__(
        self,
        db: Db,
        min_snapshot_date: datetime.datetime,
        ias3_auth: typing.Optional[str] = None,
    ):
        """
        :param db: database used as the local snapshot cache
        :param min_snapshot_date: snapshots older than this are considered stale
        :param ias3_auth: optional IAS3 credentials, sent to the authenticated
            Save Page Now API as ``Authorization: LOW <ias3_auth>``.
            (Fixed annotation: was ``str = None``, now ``Optional[str]``.)
        """
        self.min_snapshot_date = min_snapshot_date
        self._db = db
        self._session = requests.Session()
        self._session.headers["User-Agent"] = USER_AGENT
        self._ias3_auth = ias3_auth

    def _fetch_newest_wayback_snapshot(
        self, url: str
    ) -> typing.Optional[models.WebPageSnapshot]:
        """
        If the URL is already archived in the Internet Archive (and newer than
        configured with ``min_snapshot_date``), retrieves the latest snapshot available
        via the Wayback Machine and returns it.
        """
        # API documentation: https://archive.org/help/wayback_api.php
        response = self._session.get(
            "https://archive.org/wayback/available", params={"url": url}
        )
        response.raise_for_status()  # TODO: retry

        newest_ia_snapshot = (
            response.json().get("archived_snapshots", {}).get("closest", {})
        )
        if not newest_ia_snapshot:
            return None

        ia_timestamp = newest_ia_snapshot["timestamp"]
        snapshot_date = _datetime_from_ia_timestamp(ia_timestamp)

        if snapshot_date < self.min_snapshot_date:
            return None

        wayback_url = newest_ia_snapshot["url"]
        m = _wayback_url_re.match(wayback_url)
        assert m, f"Unexpected Wayback URL format: {wayback_url}"
        # Fixed: this assertion message was missing its f-prefix, so
        # "{wayback_url}" was printed literally instead of interpolated.
        assert ia_timestamp == m.group(
            "timestamp"
        ), f"Timestamp unexpectedly missing from snapshot URL: {wayback_url}"

        return self._fetch_wayback_snapshot(url, wayback_url)

    def _fetch_wayback_snapshot(
        self, url: str, wayback_url: str
    ) -> models.WebPageSnapshot:
        """
        Downloads the given Wayback Machine snapshot and wraps it in a
        :class:`models.WebPageSnapshot`.
        """
        # Add "id_" after the timestamp in the Wayback URL; it allows fetching the
        # original page without the navigation header added by the Wayback Machine.
        # Documented at https://archive.org/post/1044859/
        m = _wayback_url_re.match(wayback_url)
        assert m, f"Unexpected Wayback URL format: {wayback_url}"
        ia_timestamp = m.group("timestamp")
        snapshot_url = wayback_url.replace(ia_timestamp, ia_timestamp + "id_", 1)

        response = self._session.get(snapshot_url)
        response.raise_for_status()  # TODO: retry

        return models.WebPageSnapshot(
            url=url,
            snapshot_date=_datetime_from_ia_timestamp(ia_timestamp),
            snapshot_url=snapshot_url,
            retrieved_at=datetime.datetime.now(tz=datetime.timezone.utc),
            retrieved_by=socket.getfqdn(),
            response_headers=dict(response.headers),
            content=response.content,
        )

    def _save_page_now(self, url: str) -> typing.Optional[models.WebPageSnapshot]:
        """
        Asks the Internet Archive to capture the URL now, using the
        authenticated Save Page Now API when credentials are available.
        """
        if self._ias3_auth is None:
            return self._anonymous_save_page_now(url)
        else:
            return self._authenticated_save_page_now(url, self._ias3_auth)

    def _anonymous_save_page_now(
        self, url: str
    ) -> typing.Optional[models.WebPageSnapshot]:
        """
        Captures the URL via the unauthenticated Save Page Now endpoint,
        sleeping and retrying while rate-limited (HTTP 429); returns
        :const:`None` when the capture job failed (HTTP 520).
        """
        while True:
            try:
                response = self._session.get(
                    f"https://web.archive.org/save/{url}",
                    allow_redirects=False,
                )
                response.raise_for_status()
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 429:
                    print(e)
                    print("Sleeping...")
                    time.sleep(10)
                    continue
                elif e.response.status_code == 520:
                    # "Job failed". We will try again in the next workflow run.
                    return None
                else:
                    raise
            else:
                # The redirect target is the URL of the fresh snapshot.
                wayback_url = response.headers["Location"]
                return self._fetch_wayback_snapshot(url, wayback_url)

    def _authenticated_save_page_now(
        self, url: str, ias3_auth: str
    ) -> typing.Optional[models.WebPageSnapshot]:
        """
        Captures the URL via the authenticated Save Page Now 2 API, polling the
        capture job until it finishes; gives up after 3 failed attempts.
        """
        for _ in range(3):
            response = self._session.post(
                "https://web.archive.org/save/",
                allow_redirects=False,
                data={"url": url},
                headers={
                    "Accept": "application/json",
                    "Authorization": f"LOW {ias3_auth}",
                },
            )
            response.raise_for_status()
            if response.json().get("status_ext") == "error:too-many-daily-captures":
                # typically happens when a page repeatedly fails so we
                # (unsuccessfully) tried to capture it too many times
                return None
            job_id = response.json()["job_id"]

            # Poll the job status until it leaves the "pending" state.
            status = "pending"
            while status == "pending":
                time.sleep(5)
                response = self._session.get(
                    f"https://web.archive.org/save/status/{job_id}"
                )
                response.raise_for_status()
                status = response.json()["status"]

            if status == "success":
                break

            if response.json()["status"] == "error":
                print(response.json()["message"])
                time.sleep(10)
                continue  # retry

            assert False, response.json()
        else:
            # for/else: all 3 attempts ended in "error".
            print("Too many failures; giving up.")
            return None

        ia_timestamp = response.json()["timestamp"]
        snapshot_date = _datetime_from_ia_timestamp(ia_timestamp)

        assert snapshot_date >= self.min_snapshot_date, (
            snapshot_date.isoformat(),
            self.min_snapshot_date.isoformat(),
        )

        wayback_url = f"https://web.archive.org/web/{ia_timestamp}/{url}"
        return self._fetch_wayback_snapshot(url, wayback_url)

    def _get_cached_snapshot(self, url: str) -> typing.Optional[models.WebPageSnapshot]:
        """
        Returns the newest locally-cached snapshot of the URL, or :const:`None`
        if there is none newer than ``min_snapshot_date``.
        """
        snapshot = self._db.get_last_web_page_snapshot(url)
        if snapshot is None or snapshot.snapshot_date < self.min_snapshot_date:
            return None

        return snapshot

    def get_or_fetch_snapshot(
        self, url: str
    ) -> typing.Optional[models.WebPageSnapshot]:
        """
        Fetches the given URL from the local cache or from the Wayback Machine.

        Requests archival by the Internet Archive if the Wayback Machine does not
        already have the page available.
        """
        # First, try the local cache
        snapshot = self._get_cached_snapshot(url)
        if snapshot is not None:
            return snapshot

        # Then, try fetching from the Wayback Machine (and cache it locally)
        snapshot = self._fetch_newest_wayback_snapshot(url)
        if snapshot is not None:
            self._db.add_web_page_snapshots([snapshot])
            return snapshot

        # If the Internet Archive does not have it yet, trigger its Save Code Now,
        # and query the Wayback Machine again
        snapshot = self._save_page_now(url)
        if snapshot is not None:
            self._db.add_web_page_snapshots([snapshot])
        return snapshot
|
|
@ -0,0 +1,395 @@
|
|||
# This file is part of the Open Parts Database software
|
||||
# Copyright (C) 2022 Valentin Lorentz
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under the
|
||||
# terms of the GNU Affero General Public License version 3, as published by the
|
||||
# Free Software Foundation.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along with
|
||||
# this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
# pylint: disable=redefined-outer-name
|
||||
|
||||
"""
|
||||
Test generic web page retrieval and caching
|
||||
"""
|
||||
|
||||
import datetime
|
||||
import socket
|
||||
|
||||
import pytest
|
||||
import requests_mock
|
||||
|
||||
from opdb.db import Db, models
|
||||
from opdb.web_cache import Session
|
||||
|
||||
|
||||
@pytest.fixture
def requests_mocker():
    """Yield a :mod:`requests_mock` mocker, active for the whole test."""
    with requests_mock.mock() as mocked_http:
        yield mocked_http
|
||||
|
||||
|
||||
# Date of the "closest" Wayback Machine snapshot registered by
# configured_requests_mocker (timestamp "20220920014934", UTC).
SNAPSHOT_DATE = datetime.datetime(2022, 9, 20, 1, 49, 34, tzinfo=datetime.timezone.utc)
|
||||
|
||||
|
||||
@pytest.fixture
def mock_time(mocker):
    """
    Patch ``time.sleep`` so it returns immediately, and yield a callable
    returning the total number of seconds "slept" so far.
    """
    sleeps = []

    # Each call to time.sleep(seconds) records its argument instead of waiting.
    mocker.patch("time.sleep", side_effect=sleeps.append)

    return lambda: sum(sleeps)
|
||||
|
||||
|
||||
@pytest.fixture
def configured_requests_mocker(requests_mocker):
    """Extension of :func:`requests_mocker` that registers a bunch of URLs of
    the Wayback Machine API."""
    # Availability API: reports an existing snapshot dated SNAPSHOT_DATE
    # (20220920014934).  Individual tests may override this registration.
    requests_mocker.register_uri(
        "GET",
        "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
        complete_qs=True,
        json={
            "url": "http://example.org/",
            "archived_snapshots": {
                "closest": {
                    "status": "200",
                    "available": True,
                    "url": (
                        "http://web.archive.org/web/20220920014934/"
                        "http://john.smith@example.org/"
                    ),
                    "timestamp": "20220920014934",
                }
            },
        },
    )

    # Raw ("id_") content of the snapshot advertised above.
    requests_mocker.register_uri(
        "GET",
        "http://web.archive.org/web/20220920014934id_/http://john.smith@example.org/",
        complete_qs=True,
        text="Example page content from Wayback Machine",
    )

    # Raw content of snapshots created by the Save Page Now flows below.
    requests_mocker.register_uri(
        "GET",
        "https://web.archive.org/web/20220920164222id_/http://example.org/",
        complete_qs=True,
        text="Example page content from Wayback Machine after Save Page Now",
    )
    requests_mocker.register_uri(
        "GET",
        "https://web.archive.org/web/20220922000000id_/http://example.org/",
        complete_qs=True,
        text="Example page content from Wayback Machine after Save Page Now",
    )

    # Anonymous SPN:
    # a plain GET that blocks until archival completes, then redirects.
    requests_mocker.register_uri(
        "GET",
        "https://web.archive.org/save/http://example.org/",
        complete_qs=True,
        headers={
            "location": "https://web.archive.org/web/20220922000000/http://example.org/"
        },
        text="""
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<title>Redirecting...</title>
<h1>Redirecting...</h1>
""",
    )

    # Authenticated SPN:
    # a POST that returns a job id, then a status endpoint that reports
    # "pending" twice before succeeding (so clients must poll/sleep).
    requests_mocker.register_uri(
        "POST",
        "https://web.archive.org/save/",
        complete_qs=True,
        text='{"url": "http://example.org/", "job_id": "spn2-abcde"}',
    )
    requests_mocker.register_uri(
        "GET",
        "https://web.archive.org/save/status/spn2-abcde",
        [
            dict(
                text="""
                {"job_id":"spn2-abcde","resources":["blah"],"status":"pending"}
                """
            ),
            dict(
                text="""
                {"job_id":"spn2-abcde","resources":["blah"],"status":"pending"}
                """
            ),
            dict(
                text="""
                {"job_id":"spn2-abcde","resources":["blah"],"status":"success",
                "timestamp":"20220922000000"}
                """
            ),
        ],
        complete_qs=True,
    )

    yield requests_mocker
|
||||
|
||||
|
||||
def test_get__cached(configured_requests_mocker, opdb_db: Db):
    """
    A snapshot already present (and fresh enough) in the local cache must be
    returned as-is, without any HTTP request.
    """
    min_date = SNAPSHOT_DATE - datetime.timedelta(days=1)
    session = Session(opdb_db, min_date)

    cached_snapshot = models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=datetime.datetime.now(tz=datetime.timezone.utc),
        snapshot_url=None,
        retrieved_at=datetime.datetime(
            2022, 2, 1, 0, 0, 0, tzinfo=datetime.timezone.utc
        ),
        retrieved_by="localhost",
        response_headers={"Content-Length": "19"},
        content=b"Example page content",
    )
    opdb_db.add_web_page_snapshots([cached_snapshot])

    assert session.get_or_fetch_snapshot("http://example.org/") == cached_snapshot

    # Nothing should have hit the (mocked) network.
    assert [(r.method, r.url) for r in configured_requests_mocker.request_history] == []
|
||||
|
||||
|
||||
def test_get__uncached__recent_wb(configured_requests_mocker, opdb_db: Db):
    """
    Tests getting a snapshot that is not in the local cache, but is available in
    the Wayback Machine
    """
    after_date = SNAPSHOT_DATE - datetime.timedelta(days=1)
    s = Session(opdb_db, after_date)

    # retrieved_at is stamped by the Session itself; bracket the call so it
    # can be checked without knowing the exact value.
    dt_before = datetime.datetime.now(tz=datetime.timezone.utc)
    snapshot = s.get_or_fetch_snapshot("http://example.org/")
    dt_after = datetime.datetime.now(tz=datetime.timezone.utc)

    assert snapshot is not None
    assert dt_before <= snapshot.retrieved_at <= dt_after

    # Content and metadata must match the mocked Wayback Machine snapshot.
    assert snapshot == models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=datetime.datetime(
            2022, 9, 20, 1, 49, 34, tzinfo=datetime.timezone.utc
        ),
        snapshot_url=(
            "http://web.archive.org/web/20220920014934id_/"
            "http://john.smith@example.org/"
        ),
        retrieved_at=snapshot.retrieved_at,
        retrieved_by=socket.getfqdn(),
        response_headers={},
        content=b"Example page content from Wayback Machine",
    )

    # The fetched snapshot must also have been written to the local cache.
    assert snapshot == opdb_db.get_last_web_page_snapshot("http://example.org/")

    # Exactly two requests: availability check, then raw snapshot download.
    assert [(r.method, r.url) for r in configured_requests_mocker.request_history] == [
        (
            "GET",
            "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
        ),
        (
            "GET",
            "http://web.archive.org/web/20220920014934id_/"
            "http://john.smith@example.org/",
        ),
    ]
|
||||
|
||||
|
||||
def test_get__expired_cache__recent_wb(configured_requests_mocker, opdb_db: Db):
    """
    An outdated entry in the local cache must be ignored, and the snapshot
    fetched from the Wayback Machine instead.
    """
    stale_snapshot = models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=SNAPSHOT_DATE - datetime.timedelta(days=2),
        snapshot_url=None,
        retrieved_at=SNAPSHOT_DATE + datetime.timedelta(days=1),
        retrieved_by="localhost",
        response_headers={"Content-Length": "16"},
        content=b"Old page content",
    )
    opdb_db.add_web_page_snapshots([stale_snapshot])

    # Delegate to the uncached variant: the stale entry must make no difference.
    test_get__uncached__recent_wb(configured_requests_mocker, opdb_db)
|
||||
|
||||
|
||||
@pytest.fixture(
    params=[
        pytest.param(value, id=label)
        for value, label in [(False, "anonymous"), (True, "authenticated")]
    ],
)
def authenticated(request) -> bool:
    """Whether the Session under test should use IAS3 authentication."""
    return request.param
|
||||
|
||||
|
||||
def test_get__uncached__expired_wb(
    configured_requests_mocker, opdb_db: Db, authenticated: bool, mock_time
):
    """
    Tests getting a snapshot that is not in the local cache, and expired in
    the Wayback Machine -> uses Save Page Now
    """
    # min date is *after* the snapshot the Wayback Machine advertises, so the
    # existing snapshot is considered expired and SPN must be triggered.
    after_date = SNAPSHOT_DATE + datetime.timedelta(days=1)
    if authenticated:
        s = Session(opdb_db, after_date, ias3_auth="akey:skey")
    else:
        s = Session(opdb_db, after_date)

    assert mock_time() == 0

    dt_before = datetime.datetime.now(tz=datetime.timezone.utc)
    snapshot = s.get_or_fetch_snapshot("http://example.org/")
    dt_after = datetime.datetime.now(tz=datetime.timezone.utc)

    # Authenticated SPN polls the status endpoint (two "pending" responses
    # before "success"), sleeping between polls; anonymous SPN blocks on the
    # GET itself and never sleeps.
    if authenticated:
        assert mock_time() == 15  # three time.sleep(5) calls
    else:
        assert mock_time() == 0  # no sleep() because blocking on SPN GET request

    assert snapshot is not None
    assert dt_before <= snapshot.retrieved_at <= dt_after

    assert snapshot == models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=datetime.datetime(
            2022, 9, 22, 0, 0, 0, tzinfo=datetime.timezone.utc
        ),
        snapshot_url=(
            "https://web.archive.org/web/20220922000000id_/"  # SPN returns HTTPS URLs
            "http://example.org/"
        ),
        retrieved_at=snapshot.retrieved_at,
        retrieved_by=socket.getfqdn(),
        response_headers={},
        content=b"Example page content from Wayback Machine after Save Page Now",
    )

    # The new snapshot must also be stored in the local cache.
    assert snapshot == opdb_db.get_last_web_page_snapshot("http://example.org/")

    # Exact request sequence differs between the two SPN flows.
    if authenticated:
        assert [
            (r.method, r.url, r.body)
            for r in configured_requests_mocker.request_history
        ] == [
            (
                "GET",
                "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
                None,
            ),
            (
                "POST",
                "https://web.archive.org/save/",
                "url=http%3A%2F%2Fexample.org%2F",
            ),
            ("GET", "https://web.archive.org/save/status/spn2-abcde", None),
            ("GET", "https://web.archive.org/save/status/spn2-abcde", None),
            ("GET", "https://web.archive.org/save/status/spn2-abcde", None),
            (
                "GET",
                "https://web.archive.org/web/20220922000000id_/"  # ditto
                "http://example.org/",
                None,
            ),
        ]
    else:
        assert [
            (r.method, r.url) for r in configured_requests_mocker.request_history
        ] == [
            (
                "GET",
                "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
            ),
            (
                "GET",
                "https://web.archive.org/save/http://example.org/",
            ),
            (
                "GET",
                "https://web.archive.org/web/20220922000000id_/"  # ditto
                "http://example.org/",
            ),
        ]
|
||||
|
||||
|
||||
def test_get__expired_cache__expired_wb(
    configured_requests_mocker, opdb_db: Db, authenticated: bool, mock_time
):
    """
    An outdated local cache entry combined with an outdated Wayback Machine
    snapshot must trigger Save Page Now, exactly as when nothing is cached.
    """
    stale_snapshot = models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=SNAPSHOT_DATE - datetime.timedelta(days=2),
        snapshot_url=None,
        retrieved_at=SNAPSHOT_DATE + datetime.timedelta(days=1),
        retrieved_by="localhost",
        response_headers={"Content-Length": "16"},
        content=b"Old page content",
    )
    opdb_db.add_web_page_snapshots([stale_snapshot])

    # Delegate to the uncached variant: the stale entry must be ignored.
    test_get__uncached__expired_wb(
        configured_requests_mocker, opdb_db, authenticated, mock_time
    )
|
||||
|
||||
|
||||
def test_get__expired_cache__no_wb(
    configured_requests_mocker, opdb_db: Db, authenticated: bool, mock_time
):
    """
    Tests getting a snapshot that is expired in the local cache, and absent from
    the Wayback Machine -> uses Save Page Now
    """
    # NOTE(review): despite the name/docstring, no cache entry is inserted here
    # and the *uncached* test is reused below, so the "expired cache" half is
    # not actually exercised — confirm whether a stale snapshot should be added
    # (as in test_get__expired_cache__expired_wb).
    #
    # Override the availability API so it reports no archived snapshot at all.
    configured_requests_mocker.register_uri(
        "GET",
        "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
        complete_qs=True,
        json={"url": "http://example.org/", "archived_snapshots": {}},
    )

    # Reuse the other test; web_cache.Session should treat the absence of a page
    # exactly the same way as an expired one.
    test_get__uncached__expired_wb(
        configured_requests_mocker, opdb_db, authenticated, mock_time
    )
|
|
@ -0,0 +1,59 @@
|
|||
[build-system]
|
||||
requires = ["setuptools"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "opdb"
|
||||
version = "0.0.1"
|
||||
requires-python = ">=3.9"
|
||||
dependencies = [
|
||||
"luigi == 3.*",
|
||||
"psycopg == 3.*",
|
||||
"requests == 2.*",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
testing = [
|
||||
"pytest",
|
||||
"pytest-mock",
|
||||
"pytest-postgresql",
|
||||
"requests-mock",
|
||||
"types-requests",
|
||||
"types-setuptools",
|
||||
]
|
||||
|
||||
[tool.isort]
|
||||
profile = "black"
|
||||
|
||||
[tool.mypy]
|
||||
python_version = "3.9"
|
||||
|
||||
[[tool.mypy.overrides]]
|
||||
module = [
|
||||
"requests_mock",
|
||||
]
|
||||
ignore_missing_imports = true
|
||||
|
||||
[tool.pylint.format]
|
||||
max-line-length = "88"
|
||||
py-version = "3.9"
|
||||
disable = [
|
||||
# too annoying:
|
||||
"fixme",
|
||||
"invalid-name",
|
||||
"no-else-return",
|
||||
"no-else-continue",
|
||||
"too-few-public-methods",
|
||||
"too-many-instance-attributes",
|
||||
# false positives:
|
||||
"unreachable",
|
||||
"assignment-from-no-return",
|
||||
# mypy does it better:
|
||||
"no-member",
|
||||
"import-error",
|
||||
# flake8 does it already:
|
||||
"line-too-long",
|
||||
]
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
python_files = "*_test.py"
|
Loading…
Reference in New Issue