From f076efffc64b6a32d93260cc80c0c5e43e8ef9cf Mon Sep 17 00:00:00 2001
From: Val Lorentz
Date: Sun, 28 May 2023 20:03:55 +0200
Subject: [PATCH] Cache SPARQL queries to a local database

---
 glowtables/cache.py          | 103 +++++++++++++++++++++++++++++++++++
 glowtables/sparql.py         |  18 ++++--
 glowtables/tests/conftest.py |   5 +-
 3 files changed, 121 insertions(+), 5 deletions(-)
 create mode 100644 glowtables/cache.py

diff --git a/glowtables/cache.py b/glowtables/cache.py
new file mode 100644
index 0000000..dde1b60
--- /dev/null
+++ b/glowtables/cache.py
@@ -0,0 +1,103 @@
+# This file is part of the Glowtables software
+# Copyright (C) 2023 Valentin Lorentz
+#
+# This program is free software: you can redistribute it and/or modify it under the
+# terms of the GNU Affero General Public License version 3, as published by the
+# Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License along with
+# this program. If not, see <https://www.gnu.org/licenses/>.
+
+"""SPARQL query cache"""
+
+import datetime
+import random
+import sqlite3
+from typing import Optional
+
+EXPIRE_PROBA = 0.001
+"""Probability an ``INSERT INTO`` is preceded by a ``DELETE`` of all old records."""
+
+CACHE_LIFETIME = datetime.timedelta(days=7)
+
+
+def _now() -> datetime.datetime:
+    return datetime.datetime.now(tz=datetime.timezone.utc)
+
+
+class Cache:
+    """A simple key-value cache for SPARQL queries"""
+
+    def __init__(self, db: str):
+        self._db = sqlite3.connect(db)
+        self._init_schema()
+
+    def _init_schema(self):
+        """Initialize tables and indexes"""
+        with self._db:
+            self._db.execute(
+                """
+                CREATE TABLE IF NOT EXISTS sparql_queries (
+                    url TEXT,
+                    query TEXT,
+                    response TEXT,
+                    date TEXT  -- ISO8601 timestamp of the recorded query, must be UTC
+                );
+                """
+            )
+            self._db.execute(
+                """
+                CREATE UNIQUE INDEX IF NOT EXISTS sparql_queries_pk
+                ON sparql_queries (url, query)
+                """
+            )
+
+    def _expire(self) -> None:
+        """Randomly deletes outdated items from the database."""
+        if random.random() < EXPIRE_PROBA:
+            with self._db:
+                self._db.execute(
+                    """
+                    DELETE FROM sparql_queries WHERE date < ?
+                    """,
+                    ((_now() - CACHE_LIFETIME).isoformat(),),
+                )
+
+    def get(self, url: str, query: str) -> Optional[str]:
+        """Gets the response to a previous query from the cache, or None."""
+        with self._db:
+            cur = self._db.execute(
+                """
+                SELECT response
+                FROM sparql_queries
+                WHERE url=? AND query=? AND date >= ?
+                """,
+                (url, query, (_now() - CACHE_LIFETIME).isoformat()),
+            )
+            rows = list(cur)
+            if rows:
+                # cache hit
+                ((resp,),) = rows
+                return resp
+            else:
+                # cache miss
+                return None
+
+    def set(self, url: str, query: str, response: str) -> None:
+        """Adds the response of a query to the cache."""
+        self._expire()
+        with self._db:
+            self._db.execute(
+                """
+                INSERT INTO sparql_queries(url, query, response, date)
+                VALUES (?, ?, ?, ?)
+                ON CONFLICT(url, query) DO UPDATE SET
+                    response=EXCLUDED.response,
+                    date=EXCLUDED.date
+                """,
+                (url, query, response, _now().isoformat()),
+            )
diff --git a/glowtables/sparql.py b/glowtables/sparql.py
index 0724fd3..a51d968 100644
--- a/glowtables/sparql.py
+++ b/glowtables/sparql.py
@@ -15,11 +15,14 @@
 """Abstraction over SPARQL backends, primarily meant to be mocked by tests."""
 
 import abc
+import json
 import urllib.parse
 from typing import Iterable
 
 import requests
 
+from .cache import Cache
+
 
 class SparqlBackend(abc.ABC):
     """Abstract class for SPARQL clients"""
@@ -32,7 +35,7 @@
 class RemoteSparqlBackend(SparqlBackend):
     """Queries a SPARQL API over HTTP."""
 
-    def __init__(self, url: str, agent: str):
+    def __init__(self, url: str, agent: str, cache: Cache):
         """
         :param url: Base URL of the endpoint
         :param agent: User-Agent to use in HTTP requests
@@ -40,6 +43,7 @@ class RemoteSparqlBackend(SparqlBackend):
         self._url = url
         self._session = requests.Session()
         self._session.headers["User-Agent"] = agent
+        self._cache = cache
 
     def query(self, query: str) -> Iterable[tuple]:
         headers = {
@@ -47,9 +51,15 @@
             "Content-Type": "application/x-www-form-urlencoded",
             "Accept": "application/json",
         }
         params = {"query": query}
-        resp = self._session.post(
-            self._url, headers=headers, data=urllib.parse.urlencode(params)
-        ).json()
+
+        resp_text = self._cache.get(self._url, query)
+        if not resp_text:
+            resp_text = self._session.post(
+                self._url, headers=headers, data=urllib.parse.urlencode(params)
+            ).text
+            self._cache.set(self._url, query, resp_text)
+
+        resp = json.loads(resp_text)
         variables = resp["head"]["vars"]
         for result in resp["results"]["bindings"]:
             yield tuple(result.get(variable) for variable in variables)
diff --git a/glowtables/tests/conftest.py b/glowtables/tests/conftest.py
index dc6f69b..cae9857 100644
--- a/glowtables/tests/conftest.py
+++ b/glowtables/tests/conftest.py
@@ -21,6 +21,7 @@
 import urllib.parse
 
 import pytest
 import rdflib
 
+from glowtables.cache import Cache
 from glowtables.sparql import RemoteSparqlBackend
@@ -47,4 +48,6 @@ def rdflib_sparql(requests_mock, rdflib_graph: rdflib.Graph) -> RemoteSparqlBack
     }
     requests_mock.register_uri("POST", "mock://sparql.example.org/", json=json_callback)
 
-    return RemoteSparqlBackend("mock://sparql.example.org/", agent="Mock Client")
+    return RemoteSparqlBackend(
+        "mock://sparql.example.org/", agent="Mock Client", cache=Cache(":memory:")
+    )
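
A minimal usage sketch of the cached backend follows, for reference only; it is not part of the patch, and the cache file name, endpoint URL, agent string, and query are placeholder values:

from glowtables.cache import Cache
from glowtables.sparql import RemoteSparqlBackend

# Hypothetical wiring; the file name, endpoint URL, agent, and query are placeholders.
cache = Cache("sparql-cache.sqlite3")
backend = RemoteSparqlBackend(
    "https://query.example.org/sparql",  # placeholder endpoint
    agent="glowtables example client",   # placeholder User-Agent
    cache=cache,
)

# The first run POSTs the query and stores the raw JSON text in SQLite; identical
# (url, query) pairs within CACHE_LIFETIME (7 days) are then answered from the
# local database without another HTTP request.
for row in backend.query("SELECT ?s ?p ?o WHERE { ?s ?p ?o } LIMIT 3"):
    print(row)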