Initialize with a simple ORM and database to store webpage snapshots
All checks were successful
ci/woodpecker/pr/woodpecker Pipeline was successful
ci/woodpecker/push/woodpecker Pipeline was successful

This commit is contained in:
Val Lorentz 2022-09-17 23:43:54 +02:00
parent 671112566f
commit 95fcb043e8
12 changed files with 475 additions and 11 deletions

View File

@ -20,14 +20,15 @@ pipeline:
test-py3.9:
group: test
image: python:3.9
commands:
commands: &test_commands
- apt-get update
- apt-get install -y postgresql
- pip3 install mypy .[testing]
- make mypy
- make pytest
- adduser pytest
# pytest-postgresql runs pg_ctl, which refuses to run as root
- su pytest -c 'make pytest'
test-py3.10:
group: test
image: python:3.10
commands:
- pip3 install mypy .[testing]
- make mypy
- make pytest
commands: *test_commands

43
opdb/conftest.py Normal file
View File

@ -0,0 +1,43 @@
# This file is part of the Open Parts Database software
# Copyright (C) 2022 Valentin Lorentz
#
# This program is free software: you can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License version 3, as published by the
# Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
"""
pytest fixtures
"""
import pytest
from opdb.db import Db, models
def iter_subclasses(cls):
    """
    Yields ``cls`` itself, followed by all of its direct and indirect subclasses.

    Classes come out depth-first: each class is immediately followed by its own
    subclasses, before any of its siblings.
    """
    pending = [cls]
    while pending:
        current = pending.pop()
        yield current
        # Pushed in reverse so the first subclass is the next one popped
        pending.extend(reversed(current.__subclasses__()))
@pytest.fixture
def opdb_db(postgresql) -> Db:
    """
    pytest fixture which provides an empty initialized OPDB database.

    Every attribute of the ``models`` module that has a ``TABLE`` attribute is
    treated as a model: its schema is executed on the ``postgresql`` connection,
    which is then wrapped in a :class:`Db`.
    """
    with postgresql.cursor() as cursor:
        for attr_name in dir(models):
            candidate = getattr(models, attr_name)
            if not hasattr(candidate, "TABLE"):
                # skip everything that does not declare a table
                continue
            cursor.execute(candidate.db_schema())
    return Db(postgresql)

20
opdb/db/__init__.py Normal file
View File

@ -0,0 +1,20 @@
# This file is part of the Open Parts Database software
# Copyright (C) 2022 Valentin Lorentz
#
# This program is free software: you can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License version 3, as published by the
# Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
"""
Database management
"""
from . import models, orm # noqa
from .db import Db # noqa

65
opdb/db/db.py Normal file
View File

@ -0,0 +1,65 @@
# This file is part of the Open Parts Database software
# Copyright (C) 2022 Valentin Lorentz
#
# This program is free software: you can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License version 3, as published by the
# Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
"""
Abstraction over the postgresql database used by OPDB
"""
from __future__ import annotations
import contextlib
import typing
import psycopg
from . import models
class Db:
    """
    Abstraction over the postgresql database used by OPDB

    Wraps an already-open :class:`psycopg.Connection`, available as ``conn``.
    Use :meth:`open` to build an instance from a connection string instead.
    """

    def __init__(self, conn: psycopg.Connection):
        self.conn = conn

    @classmethod
    @contextlib.contextmanager
    def open(cls, dsn: str) -> typing.Iterator[Db]:
        """
        Context manager, which yields a :class:`Db` object given a libpq connection
        string (DSN)

        The connection is opened with :func:`psycopg.connect` and used as a context
        manager, so it is released when the ``with`` block is left.
        """
        with psycopg.connect(dsn) as conn:
            # cls rather than Db, so subclasses get an instance of their own type
            yield cls(conn)

    def get_last_web_page_snapshot(
        self, url: str
    ) -> typing.Optional[models.WebPageSnapshot]:
        """
        Returns the last snapshot of the given IRI, ie. the one with the greatest
        ``snapshot_date``, or :const:`None` if no snapshot of it is stored.
        """
        with self.conn.cursor(
            row_factory=psycopg.rows.class_row(models.WebPageSnapshot)
        ) as cur:
            # Without ORDER BY, postgresql is free to return any of the snapshots
            # of this URL; sort explicitly so "last" is actually the newest one.
            cur.execute(
                "SELECT * FROM web_page_snapshot WHERE url=%s "
                "ORDER BY snapshot_date DESC LIMIT 1",
                (url,),
            )
            return cur.fetchone()

    def add_web_page_snapshots(
        self, snapshots: typing.Iterable[models.WebPageSnapshot]
    ) -> None:
        """
        Stores new snapshots of web pages to the database.

        Insertion is delegated to :meth:`models.WebPageSnapshot.copy_to_db`.
        """
        models.WebPageSnapshot.copy_to_db(self.conn, snapshots)

42
opdb/db/db_test.py Normal file
View File

@ -0,0 +1,42 @@
# This file is part of the Open Parts Database software
# Copyright (C) 2022 Valentin Lorentz
#
# This program is free software: you can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License version 3, as published by the
# Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
"""
Tests basic insertion and retrieval functions
"""
import datetime
from opdb.db import Db, models
def test_missing_web_page_snapshot(opdb_db: Db):
    """Checks that looking up a URL with no stored snapshot yields None."""
    missing = opdb_db.get_last_web_page_snapshot("http://nonexistent.org")
    assert missing is None
def test_add_web_page_snapshot(opdb_db: Db):
    """Checks that a snapshot written to the database is read back unchanged."""
    utc = datetime.timezone.utc
    retrieval_time = datetime.datetime.now(tz=utc)
    capture_time = datetime.datetime.now(tz=utc)
    page_url = "http://example.org/"

    stored = models.WebPageSnapshot(
        url=page_url,
        snapshot_date=capture_time,
        snapshot_url=None,
        retrieved_at=retrieval_time,
        response_headers={"Content-Length": "7"},
        content=b"foo bar",
    )
    opdb_db.add_web_page_snapshots([stored])

    fetched = opdb_db.get_last_web_page_snapshot(page_url)
    assert fetched == stored

51
opdb/db/models.py Normal file
View File

@ -0,0 +1,51 @@
# This file is part of the Open Parts Database software
# Copyright (C) 2022 Valentin Lorentz
#
# This program is free software: you can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License version 3, as published by the
# Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
"""
Classes representing objects in the postgresql database
"""
import dataclasses
import datetime
import typing
from .orm import BaseModel as _BaseModel
@dataclasses.dataclass(frozen=True)
class WebPageSnapshot(_BaseModel):
    """Local cache of a live webpage

    Instances are immutable (frozen dataclass). They are stored in the
    ``web_page_snapshot`` table, whose primary key is ``(url, snapshot_date)``.
    """

    # Name of the SQL table holding the snapshots
    TABLE = "web_page_snapshot"
    # Columns making up the primary key of that table
    PK = ("url", "snapshot_date")

    url: str
    """IRI of the page"""

    snapshot_date: datetime.datetime
    """Moment the snapshot was taken from the live website"""

    snapshot_url: typing.Optional[str]
    """IRI where the page was downloaded from (:const:`None` unless the snapshot
was downloaded from a proxy)."""

    retrieved_at: datetime.datetime
    """Moment the snapshot was downloaded by opdb and inserted in the DB (differs from
:attr:`snapshot_date` if the snapshot was taken by a proxy)."""

    response_headers: dict[str, str]
    """Response headers of the webpage"""

    content: bytes
    """Content of the webpage."""

49
opdb/db/models_test.py Normal file
View File

@ -0,0 +1,49 @@
# This file is part of the Open Parts Database software
# Copyright (C) 2022 Valentin Lorentz
#
# This program is free software: you can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License version 3, as published by the
# Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
"""
Tests the ORM
"""
import datetime
import pytest
from opdb.db import models
def test_naive_datetime():
    """Tests using a naive datetime as attribute of a model raises an error.

    Each datetime field is made naive in turn, while the other one stays
    timezone-aware, so the check is exercised on every field separately.
    """
    tz_date = datetime.datetime.now(tz=datetime.timezone.utc)
    naive_date = datetime.datetime.now()

    cases = [
        {"snapshot_date": tz_date, "retrieved_at": naive_date},
        {"snapshot_date": naive_date, "retrieved_at": tz_date},
    ]
    for dates in cases:
        with pytest.raises(TypeError, match="timezone-aware datetime"):
            models.WebPageSnapshot(
                url="http://example.org/",
                snapshot_url=None,
                # header values are str, as declared by the model's annotation
                response_headers={"Content-Length": "7"},
                content=b"foo bar",
                **dates,
            )

157
opdb/db/orm.py Normal file
View File

@ -0,0 +1,157 @@
# This file is part of the Open Parts Database software
# Copyright (C) 2022 Valentin Lorentz
#
# This program is free software: you can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License version 3, as published by the
# Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
"""
A minimalist ORM
Features:
* generates postgresql schemas
* provides easy access to postgresql's COPY FROM (even for jsonb columns)
* checks :class:`datetime.datetime` objects are timezone-aware.
"""
import dataclasses
import datetime
import json
import typing
import psycopg
_TSelf = typing.TypeVar("_TSelf", bound="BaseModel")

# Maps the Python types supported in model annotations to postgresql column types
_TYPE_TO_SQL = {
    datetime.datetime: "timestamptz",
    str: "text",
    bytes: "bytea",
    dict: "jsonb",
}


def _type_to_sql(type_: type, *, nullable=False) -> str:
    """
    Returns the postgresql column type (with its ``NOT NULL`` constraint, if any)
    matching a Python type annotation.

    Supported annotations are the keys of ``_TYPE_TO_SQL``, generic aliases of them
    (eg. ``dict[str, str]``, whose parameters are ignored), and ``Optional`` of
    either, which makes the column nullable.

    :param type_: the annotation to convert
    :param nullable: whether the column may contain NULL
    :raises TypeError: if ``type_`` is a ``Union`` other than ``Optional[X]``
    :raises KeyError: if the type is not one of the keys of ``_TYPE_TO_SQL``
    """
    origin = getattr(type_, "__origin__", None)
    if origin is typing.Union:
        variants = type_.__args__  # type: ignore[attr-defined]
        # Identity check instead of issubclass(): variants may be generic aliases
        # such as dict[str, str], which issubclass() rejects on recent Pythons.
        non_none_variants = [
            variant for variant in variants if variant is not type(None)
        ]
        if len(variants) != 2:
            raise TypeError(
                f"Unsupported type: {type_} (expected exactly 2 variants, "
                f"got {variants!r})"
            )
        if len(non_none_variants) != 1:
            raise TypeError(
                f"Unsupported type: {type_} (expected exactly 1 non-None variant, "
                f"got {non_none_variants!r})"
            )
        (inner_type,) = non_none_variants
        # type is Optional[inner_type]
        return _type_to_sql(inner_type, nullable=True)

    if origin is not None:
        # another generic type; simply ignore its __args__, but keep the
        # nullability decided by an enclosing Optional[...]
        return _type_to_sql(origin, nullable=nullable)

    sql_type = _TYPE_TO_SQL[type_]
    return sql_type if nullable else f"{sql_type} NOT NULL"
class BaseModel:
    """
    Base class for all model classes, which provides class methods to generate
    DB schema and efficiently insert instances.

    Subclasses are expected to be dataclasses which set :attr:`TABLE` and
    :attr:`PK`; their annotations are inspected once, when the subclass is created.
    """

    TABLE: str
    """Name of the SQL table."""

    PK: tuple[str, ...]
    """Primary key of the SQL table."""

    __DATETIME_FIELD_NAMES: list[str]
    __JSON_FIELD_NAMES: list[str]

    def __init_subclass__(cls, *args, **kwargs):
        """
        Precomputes ``__DATETIME_FIELD_NAMES`` and ``__JSON_FIELD_NAMES`` on
        class initialization, so ``__post_init__`` and ``copy_to_db`` do not need
        to run the whole introspection machinery every time.

        A field counts as a datetime (resp. JSON) field if its annotation is a
        subclass of :class:`datetime.datetime` (resp. :class:`dict`), a generic
        alias of one (eg. ``dict[str, str]``), or an ``Optional`` of either.
        """
        super().__init_subclass__(*args, **kwargs)
        cls.__DATETIME_FIELD_NAMES = []
        cls.__JSON_FIELD_NAMES = []
        for field_name, field_type in cls.__annotations__.items():
            if getattr(field_type, "__origin__", None) is typing.Union:
                # Optional[X] is classified like X
                variants = [
                    variant
                    for variant in field_type.__args__
                    if variant is not type(None)
                ]
            else:
                variants = [field_type]

            # Generic aliases are not classes (isinstance(dict[str, str], type) is
            # False since Python 3.11), so look at their origin instead.
            origins = [
                getattr(variant, "__origin__", None) or variant for variant in variants
            ]
            classes = [origin for origin in origins if isinstance(origin, type)]

            if any(issubclass(class_, datetime.datetime) for class_ in classes):
                cls.__DATETIME_FIELD_NAMES.append(field_name)
            if any(issubclass(class_, dict) for class_ in classes):
                cls.__JSON_FIELD_NAMES.append(field_name)

    def __post_init__(self):
        """
        Errors if any of the fields is a naive datetime.

        :raises TypeError: if a datetime field has no ``tzinfo``
        """
        for field_name in self.__DATETIME_FIELD_NAMES:
            value = getattr(self, field_name)
            # Optional datetime fields may legitimately be None
            if value is not None and value.tzinfo is None:
                raise TypeError(f"{field_name} must be a timezone-aware datetime.")

    # Annotations below are quoted so that creating this class does not evaluate
    # them.
    @classmethod
    def copy_to_db(
        cls: "type[_TSelf]",
        conn: "psycopg.Connection",
        objects: "typing.Iterable[_TSelf]",
    ) -> None:
        """
        Takes a postgresql connection and an iterable of instances,
        and inserts all the instances efficiently in postgresql.

        JSON fields are serialized with :func:`json.dumps`, unless they are
        :const:`None`, which is passed through unchanged like any other value.
        """
        cols = [field.name for field in dataclasses.fields(cls)]
        json_cols = frozenset(cls.__JSON_FIELD_NAMES)

        with conn.cursor() as cur:
            with cur.copy(f"COPY {cls.TABLE} ({', '.join(cols)}) FROM STDIN") as copy:
                for obj in objects:
                    row = []
                    for col in cols:
                        value = getattr(obj, col)
                        if col in json_cols and value is not None:
                            value = json.dumps(value)
                        row.append(value)
                    copy.write_row(tuple(row))

    @classmethod
    def db_schema(cls) -> str:
        """
        Returns SQL code suitable to initialize a table to store instances
        of this class.

        The code creates the table with one column per dataclass field, then a
        unique index on the columns listed in :attr:`PK`; both statements are
        no-ops if the table or index already exists.
        """
        return "\n".join(
            [
                f"CREATE TABLE IF NOT EXISTS {cls.TABLE} (",
                ",\n".join(
                    f" {field.name} {_type_to_sql(field.type)}"
                    for field in dataclasses.fields(cls)
                ),
                ");",
                f"CREATE UNIQUE INDEX IF NOT EXISTS {cls.TABLE}_pk ON {cls.TABLE} "
                f"({', '.join(cls.PK)});",
            ]
        )

37
opdb/db/orm_test.py Normal file
View File

@ -0,0 +1,37 @@
# This file is part of the Open Parts Database software
# Copyright (C) 2022 Valentin Lorentz
#
# This program is free software: you can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License version 3, as published by the
# Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
"""
Tests the ORM
"""
import textwrap
from opdb.db import models
def test_db_schema():
    """Tests generation of the DB schema for WebPageSnapshot.

    The output of ``db_schema()`` is compared for exact equality with the expected
    SQL text, after it went through :func:`textwrap.dedent`.
    """
    # NOTE(review): the comparison is whitespace-sensitive, so the leading
    # whitespace of the column lines below must match what db_schema() emits --
    # confirm against the generator.
    assert models.WebPageSnapshot.db_schema() == textwrap.dedent(
        """\
CREATE TABLE IF NOT EXISTS web_page_snapshot (
url text NOT NULL,
snapshot_date timestamptz NOT NULL,
snapshot_url text,
retrieved_at timestamptz NOT NULL,
response_headers jsonb NOT NULL,
content bytea NOT NULL
);
CREATE UNIQUE INDEX IF NOT EXISTS web_page_snapshot_pk ON web_page_snapshot (url, snapshot_date);"""  # noqa
    )

View File

@ -1,5 +0,0 @@
"""test"""
def test_foo():
    """Placeholder test.

    It contains no statement besides this docstring, so it always passes.
    """

View File

@ -33,6 +33,7 @@ disable = [
"invalid-name",
# mypy does it better:
"no-member",
"import-error",
# flake8 does it already:
"line-too-long",
]

3
setup.cfg Normal file
View File

@ -0,0 +1,3 @@
[flake8]
max-line-length = 88
extend-ignore = E203