Initialize with a simple ORM and database to store webpage snapshots
This commit is contained in:
parent
671112566f
commit
95fcb043e8
|
@ -20,14 +20,15 @@ pipeline:
|
|||
test-py3.9:
|
||||
group: test
|
||||
image: python:3.9
|
||||
commands:
|
||||
commands: &test_commands
|
||||
- apt-get update
|
||||
- apt-get install -y postgresql
|
||||
- pip3 install mypy .[testing]
|
||||
- make mypy
|
||||
- make pytest
|
||||
- adduser pytest
|
||||
# pytest-postgresql runs pg_ctl, which refuses to run as root
|
||||
- su pytest -c 'make pytest'
|
||||
test-py3.10:
|
||||
group: test
|
||||
image: python:3.10
|
||||
commands:
|
||||
- pip3 install mypy .[testing]
|
||||
- make mypy
|
||||
- make pytest
|
||||
commands: *test_commands
|
||||
|
|
|
@ -0,0 +1,43 @@
|
|||
# This file is part of the Open Parts Database software
|
||||
# Copyright (C) 2022 Valentin Lorentz
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under the
|
||||
# terms of the GNU Affero General Public License version 3, as published by the
|
||||
# Free Software Foundation.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along with
|
||||
# this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
"""
|
||||
pytest fixtures
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from opdb.db import Db, models
|
||||
|
||||
|
||||
def iter_subclasses(cls):
    """
    Walks the class hierarchy rooted at ``cls`` depth-first, yielding ``cls``
    itself followed by each of its direct and indirect subclasses.
    """
    pending = [cls]
    while pending:
        current = pending.pop()
        yield current
        # Pushed in reverse so children are popped (and therefore yielded) in
        # the order __subclasses__() lists them.
        pending.extend(reversed(current.__subclasses__()))
|
||||
|
||||
|
||||
@pytest.fixture
def opdb_db(postgresql) -> Db:
    """
    pytest fixture which runs ``db_schema()`` of every attribute of the ``models``
    module that has a ``TABLE`` attribute, then returns a :class:`Db` wrapping
    the ``postgresql`` fixture.
    """
    candidates = (getattr(models, attr_name) for attr_name in dir(models))
    with postgresql.cursor() as cursor:
        for candidate in candidates:
            if not hasattr(candidate, "TABLE"):
                continue
            cursor.execute(candidate.db_schema())

    return Db(postgresql)
|
|
@ -0,0 +1,20 @@
|
|||
# This file is part of the Open Parts Database software
|
||||
# Copyright (C) 2022 Valentin Lorentz
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under the
|
||||
# terms of the GNU Affero General Public License version 3, as published by the
|
||||
# Free Software Foundation.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along with
|
||||
# this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
"""
|
||||
Database management
|
||||
"""
|
||||
|
||||
from . import models, orm # noqa
|
||||
from .db import Db # noqa
|
|
@ -0,0 +1,65 @@
|
|||
# This file is part of the Open Parts Database software
|
||||
# Copyright (C) 2022 Valentin Lorentz
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under the
|
||||
# terms of the GNU Affero General Public License version 3, as published by the
|
||||
# Free Software Foundation.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along with
|
||||
# this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
"""
|
||||
Abstraction over the postgresql database used by OPDB
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import typing
|
||||
|
||||
import psycopg
|
||||
|
||||
from . import models
|
||||
|
||||
|
||||
class Db:
    """
    Abstraction over the postgresql database used by OPDB
    """

    def __init__(self, conn: psycopg.Connection):
        """
        Wraps an already-open psycopg connection; every method of this class
        runs its queries on ``conn``.
        """
        self.conn = conn

    @classmethod
    @contextlib.contextmanager
    def open(cls, dsn: str) -> typing.Iterator[Db]:
        """
        Context manager, which yields a :class:`Db` object given a libpq connection
        string (DSN)
        """
        with psycopg.connect(dsn) as conn:
            # Instantiate through ``cls`` so subclasses get instances of themselves.
            yield cls(conn)

    def get_last_web_page_snapshot(
        self, url: str
    ) -> typing.Optional[models.WebPageSnapshot]:
        """
        Returns the snapshot of the given IRI with the most recent
        ``snapshot_date``, or :const:`None` if no snapshot of it is stored.
        """
        with self.conn.cursor(
            row_factory=psycopg.rows.class_row(models.WebPageSnapshot)
        ) as cur:
            # Without an explicit ordering postgresql may return any of the
            # snapshots of this URL, not necessarily the latest one.
            cur.execute(
                "SELECT * FROM web_page_snapshot WHERE url=%s "
                "ORDER BY snapshot_date DESC LIMIT 1",
                (url,),
            )
            return cur.fetchone()

    def add_web_page_snapshots(
        self, snapshots: typing.Iterable[models.WebPageSnapshot]
    ) -> None:
        """
        Stores new snapshots of web pages to the database.
        """
        models.WebPageSnapshot.copy_to_db(self.conn, snapshots)
|
|
@ -0,0 +1,42 @@
|
|||
# This file is part of the Open Parts Database software
|
||||
# Copyright (C) 2022 Valentin Lorentz
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under the
|
||||
# terms of the GNU Affero General Public License version 3, as published by the
|
||||
# Free Software Foundation.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along with
|
||||
# this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
"""
|
||||
Tests basic insertion and retrieval functions
|
||||
"""
|
||||
|
||||
import datetime
|
||||
|
||||
from opdb.db import Db, models
|
||||
|
||||
|
||||
def test_missing_web_page_snapshot(opdb_db: Db):
    """Checks that looking up a URL with no stored snapshot gives None."""
    missing = opdb_db.get_last_web_page_snapshot("http://nonexistent.org")
    assert missing is None
|
||||
|
||||
|
||||
def test_add_web_page_snapshot(opdb_db: Db):
    """Checks that a stored snapshot is returned unchanged by a later lookup."""
    utc = datetime.timezone.utc
    retrieved_at = datetime.datetime.now(tz=utc)
    stored = models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=datetime.datetime.now(tz=utc),
        snapshot_url=None,
        retrieved_at=retrieved_at,
        response_headers={"Content-Length": "7"},
        content=b"foo bar",
    )
    opdb_db.add_web_page_snapshots([stored])

    fetched = opdb_db.get_last_web_page_snapshot("http://example.org/")
    assert fetched == stored
|
|
@ -0,0 +1,51 @@
|
|||
# This file is part of the Open Parts Database software
|
||||
# Copyright (C) 2022 Valentin Lorentz
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under the
|
||||
# terms of the GNU Affero General Public License version 3, as published by the
|
||||
# Free Software Foundation.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along with
|
||||
# this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
"""
|
||||
Classes representing objects in the postgresql database
|
||||
"""
|
||||
|
||||
import dataclasses
|
||||
import datetime
|
||||
import typing
|
||||
|
||||
from .orm import BaseModel as _BaseModel
|
||||
|
||||
|
||||
@dataclasses.dataclass(frozen=True)
class WebPageSnapshot(_BaseModel):
    """Local cache of a live webpage.

    Instances are frozen dataclasses. They are stored in the
    ``web_page_snapshot`` table, whose key is ``(url, snapshot_date)``.
    """

    # Name of the SQL table holding the snapshots.
    TABLE = "web_page_snapshot"
    # Columns which together identify a snapshot.
    PK = ("url", "snapshot_date")

    url: str
    """IRI of the page"""

    snapshot_date: datetime.datetime
    """Moment the snapshot was taken from the live website"""

    snapshot_url: typing.Optional[str]
    """IRI where the page was downloaded from (:const:`None` unless the snapshot
    was downloaded from a proxy)."""

    retrieved_at: datetime.datetime
    """Moment the snapshot was downloaded by opdb and inserted in the DB (differs from
    :attr:`snapshot_date` if the snapshot was taken by a proxy)."""

    response_headers: dict[str, str]
    """Response headers of the webpage"""

    content: bytes
    """Content of the webpage."""
|
|
@ -0,0 +1,49 @@
|
|||
# This file is part of the Open Parts Database software
|
||||
# Copyright (C) 2022 Valentin Lorentz
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under the
|
||||
# terms of the GNU Affero General Public License version 3, as published by the
|
||||
# Free Software Foundation.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along with
|
||||
# this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
"""
|
||||
Tests the ORM
|
||||
"""
|
||||
|
||||
import datetime
|
||||
|
||||
import pytest
|
||||
|
||||
from opdb.db import models
|
||||
|
||||
|
||||
def test_naive_datetime():
    """Tests using a naive datetime as attribute of a model raises an error."""
    tz_date = datetime.datetime.now(tz=datetime.timezone.utc)
    naive_date = datetime.datetime.now()

    # Naive ``retrieved_at``, timezone-aware ``snapshot_date``
    with pytest.raises(TypeError, match="timezone-aware datetime"):
        models.WebPageSnapshot(
            url="http://example.org/",
            snapshot_date=tz_date,
            snapshot_url=None,
            retrieved_at=naive_date,
            # header values are str, as declared by ``response_headers``
            response_headers={"Content-Length": "7"},
            content=b"foo bar",
        )

    # Naive ``snapshot_date``, timezone-aware ``retrieved_at``
    with pytest.raises(TypeError, match="timezone-aware datetime"):
        models.WebPageSnapshot(
            url="http://example.org/",
            snapshot_date=naive_date,
            snapshot_url=None,
            retrieved_at=tz_date,
            response_headers={"Content-Length": "7"},
            content=b"foo bar",
        )
|
|
@ -0,0 +1,157 @@
|
|||
# This file is part of the Open Parts Database software
|
||||
# Copyright (C) 2022 Valentin Lorentz
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under the
|
||||
# terms of the GNU Affero General Public License version 3, as published by the
|
||||
# Free Software Foundation.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along with
|
||||
# this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
"""
|
||||
A minimalist ORM
|
||||
|
||||
Features:
|
||||
|
||||
* generates postgresql schemas
|
||||
* provides easy access to postgresql's COPY FROM (even for jsonb columns)
|
||||
* checks :class:`datetime.datetime` objects are timezone-aware.
|
||||
"""
|
||||
|
||||
import dataclasses
|
||||
import datetime
|
||||
import json
|
||||
import typing
|
||||
|
||||
import psycopg
|
||||
|
||||
_TSelf = typing.TypeVar("_TSelf", bound="BaseModel")

# Postgresql column type used for each supported Python field type.
_TYPE_TO_SQL = {
    datetime.datetime: "timestamptz",
    str: "text",
    bytes: "bytea",
    dict: "jsonb",
}


def _type_to_sql(type_: type, *, nullable=False) -> str:
    """
    Returns the SQL column type (including ``NOT NULL`` when relevant) matching
    a field annotation.

    ``Optional[X]`` gives the column type of ``X`` without ``NOT NULL``; any other
    parameterized generic (eg. ``dict[str, str]``) is treated like its origin type.

    :param type_: annotation of the field
    :param nullable: whether the column may be NULL
    :raises TypeError: if ``type_`` is a Union other than ``Optional[X]``
    :raises KeyError: if the type is not in ``_TYPE_TO_SQL``
    """
    origin = getattr(type_, "__origin__", None)
    if origin is typing.Union:
        variants = type_.__args__  # type: ignore[attr-defined]
        # Compare by identity: variants may be parameterized generics, which
        # are not classes and which issubclass() may reject.
        non_none_variants = [
            variant for variant in variants if variant is not type(None)
        ]
        if len(variants) != 2:
            raise TypeError(
                f"Unsupported type: {type_} (expected exactly 2 variants, "
                f"got {variants!r})"
            )
        if len(non_none_variants) != 1:
            raise TypeError(
                f"Unsupported type: {type_} (expected exactly 1 non-None variant, "
                f"got {non_none_variants!r})"
            )

        (inner_type,) = non_none_variants
        # type is Optional[inner_type]

        return _type_to_sql(inner_type, nullable=True)
    elif origin is not None:
        # another generic type; simply ignore its __args__, but keep the
        # nullability decided by an enclosing Optional[...]
        return _type_to_sql(origin, nullable=nullable)
    else:
        sql_type = _TYPE_TO_SQL[type_]
        if not nullable:
            sql_type += " NOT NULL"
        return sql_type
|
||||
|
||||
|
||||
class BaseModel:
    """
    Base class for all model classes, which provides class methods to generate
    DB schema and efficiently insert instances.
    """

    TABLE: str
    """Name of the SQL table."""

    PK: tuple[str, ...]
    """Primary key of the SQL table."""

    __DATETIME_FIELD_NAMES: list[str]
    __JSON_FIELD_NAMES: list[str]

    def __init_subclass__(cls, *args, **kwargs):
        """
        Precomputes ``__DATETIME_FIELD_NAMES`` and ``__JSON_FIELD_NAMES`` on
        class initialization, so ``__post_init__`` and ``copy_to_db`` do not need
        to run the whole introspection machinery every time.

        A field counts as a datetime (resp. JSON) field when its annotation,
        or the non-None variant of an ``Optional[...]`` annotation, is
        :class:`datetime.datetime` (resp. :class:`dict`), a subclass of it, or
        a parameterized generic of it.
        """
        super().__init_subclass__(*args, **kwargs)
        cls.__DATETIME_FIELD_NAMES = []
        cls.__JSON_FIELD_NAMES = []
        for (field_name, field_type) in cls.__annotations__.items():
            if getattr(field_type, "__origin__", None) is typing.Union:
                # Optional[X] is Union[X, None]: classify the field on X
                variants = [
                    variant
                    for variant in field_type.__args__
                    if variant is not type(None)
                ]
            else:
                variants = [field_type]

            # Parameterized generics (eg. dict[str, str]) are not guaranteed to
            # be classes, but their __origin__ is; anything that still is not a
            # class (eg. a string annotation) cannot be classified and is skipped.
            candidates = [
                getattr(variant, "__origin__", None) or variant for variant in variants
            ]
            base_types = [base for base in candidates if isinstance(base, type)]

            if any(issubclass(base, datetime.datetime) for base in base_types):
                cls.__DATETIME_FIELD_NAMES.append(field_name)
            if any(issubclass(base, dict) for base in base_types):
                cls.__JSON_FIELD_NAMES.append(field_name)

    def __post_init__(self):
        """
        Errors if any of the fields is a naive datetime.

        :raises TypeError: if a datetime field holds a value without ``tzinfo``
        """
        for field_name in self.__DATETIME_FIELD_NAMES:
            value = getattr(self, field_name)
            # Optional datetime fields may legitimately hold None
            if value is not None and value.tzinfo is None:
                raise TypeError(f"{field_name} must be a timezone-aware datetime.")

    @classmethod
    def copy_to_db(
        cls: type[_TSelf], conn: psycopg.Connection, objects: typing.Iterable[_TSelf]
    ) -> None:
        """
        Takes a postgresql connection and an iterable of instances,
        and inserts all the instances efficiently in postgresql.

        Values of JSON fields are serialized with :func:`json.dumps` before
        being written; :const:`None` is written as-is so it is not turned
        into the JSON ``null`` string.
        """
        cols = [field.name for field in dataclasses.fields(cls)]
        with conn.cursor() as cur:
            with cur.copy(f"COPY {cls.TABLE} ({', '.join(cols)}) FROM STDIN") as copy:
                for obj in objects:
                    row = tuple(
                        json.dumps(getattr(obj, col))
                        if col in cls.__JSON_FIELD_NAMES
                        and getattr(obj, col) is not None
                        else getattr(obj, col)
                        for col in cols
                    )
                    copy.write_row(row)

    @classmethod
    def db_schema(cls) -> str:
        """
        Returns SQL code suitable to initialize a table to store instances
        of this class.

        The code creates the table (one column per dataclass field) and a
        unique index on the columns listed in ``PK``, both only if they do not
        exist yet.
        """
        return "\n".join(
            [
                f"CREATE TABLE IF NOT EXISTS {cls.TABLE} (",
                ",\n".join(
                    f" {field.name} {_type_to_sql(field.type)}"
                    for field in dataclasses.fields(cls)
                ),
                ");",
                f"CREATE UNIQUE INDEX IF NOT EXISTS {cls.TABLE}_pk ON {cls.TABLE} "
                f"({', '.join(cls.PK)});",
            ]
        )
|
|
@ -0,0 +1,37 @@
|
|||
# This file is part of the Open Parts Database software
|
||||
# Copyright (C) 2022 Valentin Lorentz
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under the
|
||||
# terms of the GNU Affero General Public License version 3, as published by the
|
||||
# Free Software Foundation.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along with
|
||||
# this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
"""
|
||||
Tests the ORM
|
||||
"""
|
||||
|
||||
import textwrap
|
||||
|
||||
from opdb.db import models
|
||||
|
||||
|
||||
def test_db_schema():
    """Tests generation of the DB schema for WebPageSnapshot.

    The expected text is compared for strict equality with the value returned
    by ``WebPageSnapshot.db_schema()``, so whitespace is significant.
    """
    assert models.WebPageSnapshot.db_schema() == textwrap.dedent(
        """\
        CREATE TABLE IF NOT EXISTS web_page_snapshot (
         url text NOT NULL,
         snapshot_date timestamptz NOT NULL,
         snapshot_url text,
         retrieved_at timestamptz NOT NULL,
         response_headers jsonb NOT NULL,
         content bytea NOT NULL
        );
        CREATE UNIQUE INDEX IF NOT EXISTS web_page_snapshot_pk ON web_page_snapshot (url, snapshot_date);"""  # noqa
    )
|
|
@ -1,5 +0,0 @@
|
|||
"""test"""
|
||||
|
||||
|
||||
def test_foo():
    """Placeholder test whose body is only this docstring."""
|
|
@ -33,6 +33,7 @@ disable = [
|
|||
"invalid-name",
|
||||
# mypy does it better:
|
||||
"no-member",
|
||||
"import-error",
|
||||
# flake8 does it already:
|
||||
"line-too-long",
|
||||
]
|
||||
|
|
Loading…
Reference in New Issue