def iter_subclasses(cls):
    """
    Walks the class hierarchy rooted at ``cls``.

    Yields ``cls`` itself first, then each of its direct and indirect
    subclasses, depth-first, in the order reported by ``__subclasses__()``.
    """
    pending = [cls]
    while pending:
        current = pending.pop()
        yield current
        # Children are pushed in reverse so that the first child is popped
        # (and therefore yielded, along with its own descendants) first.
        pending.extend(reversed(current.__subclasses__()))


@pytest.fixture
def opdb_db(postgresql) -> Db:
    """
    pytest fixture which provides a :class:`Db` wrapping an initialized,
    empty OPDB database.

    Every attribute of :mod:`opdb.db.models` which has a ``TABLE`` attribute
    is considered to be a model; its schema is created through the
    ``postgresql`` connection before the :class:`Db` is returned.
    """
    candidates = (getattr(models, attr_name) for attr_name in dir(models))
    with postgresql.cursor() as cursor:
        for candidate in candidates:
            if not hasattr(candidate, "TABLE"):
                # not a model (module, helper, abstract base class, ...)
                continue
            cursor.execute(candidate.db_schema())
    return Db(postgresql)
class Db:
    """
    Abstraction over the postgresql database used by OPDB

    Wraps an already-open :class:`psycopg.Connection`; use :meth:`open` to get
    an instance from a connection string.
    """

    def __init__(self, conn: psycopg.Connection):
        """
        :param conn: open connection used by every method of this object.
        """
        self.conn = conn

    @classmethod
    @contextlib.contextmanager
    def open(cls, dsn: str) -> typing.Iterator[Db]:
        """
        Context manager, which yields a :class:`Db` object given a libpq connection
        string (DSN)

        The connection is managed by ``psycopg.connect``'s own context manager,
        which is exited when this context manager exits.
        """
        with psycopg.connect(dsn) as conn:
            # cls rather than Db, so subclasses get an instance of themselves
            yield cls(conn)

    def get_last_web_page_snapshot(
        self, url: str
    ) -> typing.Optional[models.WebPageSnapshot]:
        """
        Returns the last snapshot of the given IRI, i.e. the one with the greatest
        ``snapshot_date``, or :const:`None` if there is no snapshot of it.
        """
        with self.conn.cursor(
            row_factory=psycopg.rows.class_row(models.WebPageSnapshot)
        ) as cur:
            # Without ORDER BY/LIMIT, postgresql is free to return any of the
            # snapshots of this URL, not necessarily the most recent one.
            cur.execute(
                "SELECT * FROM web_page_snapshot WHERE url=%s "
                "ORDER BY snapshot_date DESC LIMIT 1",
                (url,),
            )
            return cur.fetchone()

    def add_web_page_snapshots(
        self, snapshots: typing.Iterable[models.WebPageSnapshot]
    ) -> None:
        """
        Stores new snapshots of web pages to the database.

        Delegates to ``WebPageSnapshot.copy_to_db`` using this object's connection.
        """
        models.WebPageSnapshot.copy_to_db(self.conn, snapshots)
def test_missing_web_page_snapshot(opdb_db: Db):
    """Checks that looking up a URL which was never stored gives None."""
    found = opdb_db.get_last_web_page_snapshot("http://nonexistent.org")
    assert found is None


def test_add_web_page_snapshot(opdb_db: Db):
    """Checks that a stored snapshot is returned unchanged when looked up."""
    utc = datetime.timezone.utc
    retrieved_at = datetime.datetime.now(tz=utc)
    snapshot_date = datetime.datetime.now(tz=utc)
    stored = models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=snapshot_date,
        snapshot_url=None,
        retrieved_at=retrieved_at,
        response_headers={"Content-Length": "7"},
        content=b"foo bar",
    )

    opdb_db.add_web_page_snapshots([stored])

    fetched = opdb_db.get_last_web_page_snapshot("http://example.org/")
    assert fetched == stored
@dataclasses.dataclass(frozen=True)
class WebPageSnapshot(_BaseModel):
    """Local cache of a live webpage

    Immutable (frozen) dataclass. ``TABLE`` and ``PK`` are plain class
    attributes (not dataclass fields) read by the ORM base class to generate
    the table and its unique index; the annotated attributes below are the
    columns, in declaration order.

    Both datetime fields must be timezone-aware: the base class'
    ``__post_init__`` raises :exc:`TypeError` otherwise.
    """

    # Name of the postgresql table storing instances of this class.
    TABLE = "web_page_snapshot"
    # Columns of the unique index: one row per (page, moment of the snapshot).
    PK = ("url", "snapshot_date")

    url: str
    """IRI of the page"""

    snapshot_date: datetime.datetime
    """Moment the snapshot was taken from the live website"""

    snapshot_url: typing.Optional[str]
    """IRI where the page was downloaded from (:const:`None` unless the snapshot
    was downloaded from a proxy)."""

    retrieved_at: datetime.datetime
    """Moment the snapshot was downloaded by opdb and inserted in the DB (differs from
    :attr:`snapshot_date` if the snapshot was taken by a proxy)."""

    # Stored in a jsonb column; serialized with json.dumps by the base class'
    # copy_to_db.
    response_headers: dict[str, str]
    """Response headers of the webpage"""

    content: bytes
    """Content of the webpage."""
def test_naive_datetime():
    """Checks that building a model with a naive datetime in any of its
    datetime fields raises a TypeError."""
    aware = datetime.datetime.now(tz=datetime.timezone.utc)
    naive = datetime.datetime.now()

    # (snapshot_date, retrieved_at) pairs; exactly one member of each is naive
    cases = [(aware, naive), (naive, aware)]

    for (snapshot_date, retrieved_at) in cases:
        with pytest.raises(TypeError, match="timezone-aware datetime"):
            models.WebPageSnapshot(
                url="http://example.org/",
                snapshot_date=snapshot_date,
                snapshot_url=None,
                retrieved_at=retrieved_at,
                response_headers={"Content-Length": b"7"},
                content=b"foo bar",
            )
_TSelf = typing.TypeVar("_TSelf", bound="BaseModel")

# Maps each supported Python field type to the postgresql column type storing it.
_TYPE_TO_SQL = {
    datetime.datetime: "timestamptz",
    str: "text",
    bytes: "bytea",
    dict: "jsonb",
}


def _type_to_sql(type_: type, *, nullable=False) -> str:
    """
    Returns the postgresql column type (with ``NOT NULL`` unless nullable)
    for a field annotation.

    Supported annotations are the keys of ``_TYPE_TO_SQL``, parameterized
    generics of them (``dict[str, str]``; the parameters are ignored), and
    ``Optional`` of either (which makes the column nullable).

    :raises TypeError: for a ``Union`` which is not of the form ``Optional[X]``
    :raises KeyError: if the (unwrapped) type is not in ``_TYPE_TO_SQL``
    """
    origin = typing.get_origin(type_)
    if origin is typing.Union:
        variants = typing.get_args(type_)
        # Identity check instead of issubclass(): a variant may be a
        # parameterized generic (eg. dict[str, str]) rather than a plain class.
        non_none_variants = [
            variant for variant in variants if variant is not type(None)
        ]
        if len(variants) != 2:
            raise TypeError(
                f"Unsupported type: {type_} (expected exactly 2 variants, "
                f"got {variants!r})"
            )
        if len(non_none_variants) != 1:
            raise TypeError(
                f"Unsupported type: {type_} (expected exactly 1 non-None variant, "
                f"got {non_none_variants!r})"
            )

        (inner_type,) = non_none_variants
        # type is Optional[inner_type]

        return _type_to_sql(inner_type, nullable=True)
    if origin is not None:
        # another generic type; simply ignore its __args__, but keep the
        # nullability: Optional[dict[str, str]] must not become NOT NULL
        return _type_to_sql(origin, nullable=nullable)

    sql_type = _TYPE_TO_SQL[type_]
    if not nullable:
        sql_type += " NOT NULL"
    return sql_type


def _annotation_classes(annotation) -> list[type]:
    """
    Returns the plain classes a field annotation stands for: the members of a
    ``Union`` (including ``NoneType`` for ``Optional``), with parameterized
    generics reduced to their origin. Anything which is not a class (special
    forms, string annotations, ...) is skipped.
    """
    if typing.get_origin(annotation) is typing.Union:
        variants = typing.get_args(annotation)
    else:
        variants = (annotation,)

    classes = []
    for variant in variants:
        base = typing.get_origin(variant) or variant
        if isinstance(base, type):
            classes.append(base)
    return classes


class BaseModel:
    """
    Base class for all model classes, which provides class methods to generate
    DB schema and efficiently insert instances.

    Subclasses are expected to be dataclasses defining ``TABLE`` and ``PK``.
    """

    TABLE: str
    """Name of the SQL table."""

    PK: tuple[str, ...]
    """Primary key of the SQL table."""

    __DATETIME_FIELD_NAMES: list[str]
    __JSON_FIELD_NAMES: list[str]
    __NULLABLE_FIELD_NAMES: frozenset[str]

    def __init_subclass__(cls, *args, **kwargs):
        """
        Precomputes ``__DATETIME_FIELD_NAMES``, ``__JSON_FIELD_NAMES`` and
        ``__NULLABLE_FIELD_NAMES`` on class initialization, so ``__post_init__``
        and ``copy_to_db`` do not need to run the whole introspection machinery
        every time.

        Only annotations which are actual objects are recognized; string
        annotations are not resolved.
        """
        super().__init_subclass__(*args, **kwargs)
        cls.__DATETIME_FIELD_NAMES = []
        cls.__JSON_FIELD_NAMES = []
        nullable_field_names = set()
        for (field_name, field_type) in getattr(cls, "__annotations__", {}).items():
            # Classify through the unwrapped classes instead of testing the
            # annotation itself: Optional[...] is never an instance of `type`,
            # and parameterized generics are not reliably one either.
            classes = _annotation_classes(field_type)
            if any(issubclass(class_, datetime.datetime) for class_ in classes):
                cls.__DATETIME_FIELD_NAMES.append(field_name)
            if any(issubclass(class_, dict) for class_ in classes):
                cls.__JSON_FIELD_NAMES.append(field_name)
            if type(None) in classes:
                nullable_field_names.add(field_name)
        cls.__NULLABLE_FIELD_NAMES = frozenset(nullable_field_names)

    def __post_init__(self):
        """
        Errors if any of the fields is a naive datetime.

        :const:`None` is accepted for fields annotated as ``Optional``.

        :raises TypeError: if a datetime field holds a naive datetime
        """
        for field_name in self.__DATETIME_FIELD_NAMES:
            value = getattr(self, field_name)
            if value is None and field_name in self.__NULLABLE_FIELD_NAMES:
                continue
            # A datetime is only aware if its tzinfo actually provides an offset.
            if value.tzinfo is None or value.utcoffset() is None:
                raise TypeError(f"{field_name} must be a timezone-aware datetime.")

    @classmethod
    def copy_to_db(
        cls: type[_TSelf], conn: "psycopg.Connection", objects: typing.Iterable[_TSelf]
    ) -> None:
        """
        Takes a postgresql connection and an iterable of instances,
        and inserts all the instances efficiently in postgresql.

        Rows are written through a single ``COPY ... FROM STDIN``; values of JSON
        fields are serialized with :func:`json.dumps`, except :const:`None` in a
        nullable JSON field, which is written as SQL ``NULL``.
        """
        cols = [field.name for field in dataclasses.fields(cls)]
        json_cols = set(cls.__JSON_FIELD_NAMES)
        nullable_cols = cls.__NULLABLE_FIELD_NAMES
        with conn.cursor() as cur:
            with cur.copy(f"COPY {cls.TABLE} ({', '.join(cols)}) FROM STDIN") as copy:
                for obj in objects:
                    row = []
                    for col in cols:
                        value = getattr(obj, col)
                        is_sql_null = value is None and col in nullable_cols
                        if col in json_cols and not is_sql_null:
                            value = json.dumps(value)
                        row.append(value)
                    copy.write_row(tuple(row))

    @classmethod
    def db_schema(cls) -> str:
        """
        Returns SQL code suitable to initialize a table to store instances
        of this class.

        The code creates the table (one column per dataclass field, in
        declaration order) and a unique index on the ``PK`` columns, both only
        if they do not exist yet.
        """
        return "\n".join(
            [
                f"CREATE TABLE IF NOT EXISTS {cls.TABLE} (",
                ",\n".join(
                    f"  {field.name} {_type_to_sql(field.type)}"
                    for field in dataclasses.fields(cls)
                ),
                ");",
                f"CREATE UNIQUE INDEX IF NOT EXISTS {cls.TABLE}_pk ON {cls.TABLE} "
                f"({', '.join(cls.PK)});",
            ]
        )
def test_db_schema():
    """Tests generation of the DB schema for WebPageSnapshot.

    The expected SQL is compared verbatim (after dedenting), so it is
    sensitive to whitespace, column order, and nullability of each column.
    """
    # Columns are expected in the declaration order of the dataclass fields;
    # only snapshot_url is nullable, as it is the only Optional field.
    assert models.WebPageSnapshot.db_schema() == textwrap.dedent(
        """\
        CREATE TABLE IF NOT EXISTS web_page_snapshot (
          url text NOT NULL,
          snapshot_date timestamptz NOT NULL,
          snapshot_url text,
          retrieved_at timestamptz NOT NULL,
          response_headers jsonb NOT NULL,
          content bytea NOT NULL
        );
        CREATE UNIQUE INDEX IF NOT EXISTS web_page_snapshot_pk ON web_page_snapshot (url, snapshot_date);"""  # noqa
    )