opdb/opdb/web_cache_test.py

396 lines
12 KiB
Python

# This file is part of the Open Parts Database software
# Copyright (C) 2022 Valentin Lorentz
#
# This program is free software: you can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License version 3, as published by the
# Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
# pylint: disable=redefined-outer-name
"""
Test generic web page retrieval and caching
"""
import datetime
import socket
import pytest
import requests_mock
from opdb.db import Db, models
from opdb.web_cache import Session
@pytest.fixture
def requests_mocker():
    """Yield a :mod:`requests_mock` mocker that stays active while the test runs."""
    with requests_mock.mock() as active_mock:
        yield active_mock
# Date of the snapshot advertised by the mocked Wayback Machine availability API
# (it corresponds to the "20220920014934" timestamp registered in
# configured_requests_mocker).
SNAPSHOT_DATE = datetime.datetime(
    year=2022,
    month=9,
    day=20,
    hour=1,
    minute=49,
    second=34,
    tzinfo=datetime.timezone.utc,
)
@pytest.fixture
def mock_time(mocker):
    """
    Replaces ``time.sleep`` with a stub that returns at once and merely records
    the requested duration.

    The fixture value is a zero-argument callable giving the sum of all the
    durations passed to ``time.sleep`` so far.
    """
    state = {"slept": 0}

    def fake_sleep(seconds):
        # Accumulate instead of actually waiting
        state["slept"] += seconds

    mocker.patch("time.sleep", side_effect=fake_sleep)

    return lambda: state["slept"]
@pytest.fixture
def configured_requests_mocker(requests_mocker):
    """Variant of :func:`requests_mocker` with canned Wayback Machine responses.

    The following endpoints are registered (all with ``complete_qs=True``):

    * the availability API lookup for ``http://example.org/``;
    * three raw (``id_``) snapshot downloads;
    * anonymous Save Page Now: a ``GET`` answered with a ``location`` header;
    * authenticated Save Page Now: a ``POST`` returning a job id, and a status
      endpoint answering ``pending`` twice and then ``success``.
    """
    register = requests_mocker.register_uri

    existing_snapshot_timestamp = "20220920014934"
    availability_url = (
        "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F"
    )
    register(
        "GET",
        availability_url,
        complete_qs=True,
        json={
            "url": "http://example.org/",
            "archived_snapshots": {
                "closest": {
                    "status": "200",
                    "available": True,
                    "url": (
                        "http://web.archive.org/web/20220920014934/"
                        "http://john.smith@example.org/"
                    ),
                    "timestamp": existing_snapshot_timestamp,
                }
            },
        },
    )

    # Raw content of the snapshot advertised by the availability API
    register(
        "GET",
        "http://web.archive.org/web/20220920014934id_/http://john.smith@example.org/",
        complete_qs=True,
        text="Example page content from Wayback Machine",
    )

    # Raw content of snapshots created through Save Page Now
    for saved_snapshot_url in (
        "https://web.archive.org/web/20220920164222id_/http://example.org/",
        "https://web.archive.org/web/20220922000000id_/http://example.org/",
    ):
        register(
            "GET",
            saved_snapshot_url,
            complete_qs=True,
            text="Example page content from Wayback Machine after Save Page Now",
        )

    # Anonymous SPN:
    redirect_headers = {
        "location": "https://web.archive.org/web/20220922000000/http://example.org/"
    }
    register(
        "GET",
        "https://web.archive.org/save/http://example.org/",
        complete_qs=True,
        headers=redirect_headers,
        text="""
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<title>Redirecting...</title>
<h1>Redirecting...</h1>
""",
    )

    # Authenticated SPN:
    register(
        "POST",
        "https://web.archive.org/save/",
        complete_qs=True,
        text='{"url": "http://example.org/", "job_id": "spn2-abcde"}',
    )

    # Successive answers of the job status endpoint, served in this order
    status_responses = [
        {
            "text": """
{"job_id":"spn2-abcde","resources":["blah"],"status":"pending"}
"""
        },
        {
            "text": """
{"job_id":"spn2-abcde","resources":["blah"],"status":"pending"}
"""
        },
        {
            "text": """
{"job_id":"spn2-abcde","resources":["blah"],"status":"success",
"timestamp":"20220922000000"}
"""
        },
    ]
    register(
        "GET",
        "https://web.archive.org/save/status/spn2-abcde",
        status_responses,
        complete_qs=True,
    )

    yield requests_mocker
def test_get__cached(configured_requests_mocker, opdb_db: Db):
    """
    Tests getting a snapshot that is already in the local cache.

    A snapshot dated "now" (so later than the ``after_date`` given to the
    session) is stored in the database beforehand; the session must return
    that very snapshot and must not perform any HTTP request.
    """
    after_date = SNAPSHOT_DATE - datetime.timedelta(days=1)
    s = Session(opdb_db, after_date)

    retrieved_at = datetime.datetime(2022, 2, 1, 0, 0, 0, tzinfo=datetime.timezone.utc)
    content = b"Example page content"
    snapshot = models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=datetime.datetime.now(tz=datetime.timezone.utc),
        snapshot_url=None,
        retrieved_at=retrieved_at,
        retrieved_by="localhost",
        # Derived from the body so the recorded header always matches its size
        # (it used to be hard-coded to "19" for a 20-byte body).
        response_headers={"Content-Length": str(len(content))},
        content=content,
    )
    opdb_db.add_web_page_snapshots([snapshot])

    assert s.get_or_fetch_snapshot("http://example.org/") == snapshot

    # Served from the local cache: nothing may have gone through the mocker
    assert [(r.method, r.url) for r in configured_requests_mocker.request_history] == []
def test_get__uncached__recent_wb(configured_requests_mocker, opdb_db: Db):
    """
    Tests fetching a page the local cache cannot serve while the Wayback Machine
    reports a snapshot taken after the session's ``after_date``: that snapshot
    must be downloaded, returned, and stored in the database.
    """
    page_url = "http://example.org/"
    availability_url = (
        "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F"
    )
    raw_snapshot_url = (
        "http://web.archive.org/web/20220920014934id_/"
        "http://john.smith@example.org/"
    )
    utc = datetime.timezone.utc

    session = Session(opdb_db, SNAPSHOT_DATE - datetime.timedelta(days=1))

    started = datetime.datetime.now(tz=utc)
    fetched = session.get_or_fetch_snapshot(page_url)
    finished = datetime.datetime.now(tz=utc)

    assert fetched is not None
    assert started <= fetched.retrieved_at <= finished

    expected = models.WebPageSnapshot(
        url=page_url,
        snapshot_date=datetime.datetime(2022, 9, 20, 1, 49, 34, tzinfo=utc),
        snapshot_url=raw_snapshot_url,
        # Only known within bounds (checked above), so reuse the actual value
        retrieved_at=fetched.retrieved_at,
        retrieved_by=socket.getfqdn(),
        response_headers={},
        content=b"Example page content from Wayback Machine",
    )
    assert fetched == expected

    # The freshly fetched snapshot must also have been written to the database
    assert fetched == opdb_db.get_last_web_page_snapshot(page_url)

    performed = [
        (request.method, request.url)
        for request in configured_requests_mocker.request_history
    ]
    assert performed == [
        ("GET", availability_url),
        ("GET", raw_snapshot_url),
    ]
def test_get__expired_cache__recent_wb(configured_requests_mocker, opdb_db: Db):
    """
    Tests fetching a page whose locally cached snapshot is too old, while the
    Wayback Machine has a recent one.
    """
    stale_snapshot = models.WebPageSnapshot(
        url="http://example.org/",
        # Two days older than the snapshot offered by the Wayback Machine
        snapshot_date=SNAPSHOT_DATE - datetime.timedelta(days=2),
        snapshot_url=None,
        retrieved_at=SNAPSHOT_DATE + datetime.timedelta(days=1),
        retrieved_by="localhost",
        response_headers={"Content-Length": "16"},
        content=b"Old page content",
    )
    opdb_db.add_web_page_snapshots([stale_snapshot])

    # From here on the scenario is the uncached one: web_cache.Session is
    # expected to disregard the outdated entry.
    test_get__uncached__recent_wb(configured_requests_mocker, opdb_db)
@pytest.fixture(params=[False, True], ids=["anonymous", "authenticated"])
def authenticated(request) -> bool:
    """Parametrized flag: False (id "anonymous"), then True (id "authenticated")"""
    return request.param
def test_get__uncached__expired_wb(
    configured_requests_mocker, opdb_db: Db, authenticated: bool, mock_time
):
    """
    Tests fetching a page that the local cache cannot serve and whose Wayback
    Machine snapshot is too old -> a new snapshot is requested through
    Save Page Now, either anonymously or with credentials.
    """
    page_url = "http://example.org/"
    availability_url = (
        "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F"
    )
    status_url = "https://web.archive.org/save/status/spn2-abcde"
    # SPN returns HTTPS URLs
    raw_snapshot_url = (
        "https://web.archive.org/web/20220922000000id_/"
        "http://example.org/"
    )
    utc = datetime.timezone.utc

    session_kwargs = {"ias3_auth": "akey:skey"} if authenticated else {}
    session = Session(
        opdb_db, SNAPSHOT_DATE + datetime.timedelta(days=1), **session_kwargs
    )

    assert mock_time() == 0

    started = datetime.datetime.now(tz=utc)
    fetched = session.get_or_fetch_snapshot(page_url)
    finished = datetime.datetime.now(tz=utc)

    # authenticated: three time.sleep(5) calls
    # anonymous: no sleep() because blocking on SPN GET request
    assert mock_time() == (15 if authenticated else 0)

    assert fetched is not None
    assert started <= fetched.retrieved_at <= finished

    expected = models.WebPageSnapshot(
        url=page_url,
        snapshot_date=datetime.datetime(2022, 9, 22, 0, 0, 0, tzinfo=utc),
        snapshot_url=raw_snapshot_url,
        # Only known within bounds (checked above), so reuse the actual value
        retrieved_at=fetched.retrieved_at,
        retrieved_by=socket.getfqdn(),
        response_headers={},
        content=b"Example page content from Wayback Machine after Save Page Now",
    )
    assert fetched == expected
    assert fetched == opdb_db.get_last_web_page_snapshot(page_url)

    history = configured_requests_mocker.request_history
    if authenticated:
        # Request bodies are compared too, to check the payload of the POST
        performed = [(req.method, req.url, req.body) for req in history]
        assert performed == [
            ("GET", availability_url, None),
            (
                "POST",
                "https://web.archive.org/save/",
                "url=http%3A%2F%2Fexample.org%2F",
            ),
            ("GET", status_url, None),
            ("GET", status_url, None),
            ("GET", status_url, None),
            ("GET", raw_snapshot_url, None),
        ]
    else:
        performed = [(req.method, req.url) for req in history]
        assert performed == [
            ("GET", availability_url),
            ("GET", "https://web.archive.org/save/http://example.org/"),
            ("GET", raw_snapshot_url),
        ]
def test_get__expired_cache__expired_wb(
    configured_requests_mocker, opdb_db: Db, authenticated: bool, mock_time
):
    """
    Tests fetching a page whose snapshot is too old both in the local cache and
    in the Wayback Machine -> uses Save Page Now
    """
    stale_snapshot = models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=SNAPSHOT_DATE - datetime.timedelta(days=2),
        snapshot_url=None,
        retrieved_at=SNAPSHOT_DATE + datetime.timedelta(days=1),
        retrieved_by="localhost",
        response_headers={"Content-Length": "16"},
        content=b"Old page content",
    )
    opdb_db.add_web_page_snapshots([stale_snapshot])

    # The remainder is the uncached scenario: web_cache.Session is expected to
    # disregard the outdated entry.
    test_get__uncached__expired_wb(
        configured_requests_mocker, opdb_db, authenticated, mock_time
    )
def test_get__expired_cache__no_wb(
    configured_requests_mocker, opdb_db: Db, authenticated: bool, mock_time
):
    """
    Tests getting a snapshot that is expired in the local cache, and absent from
    the Wayback Machine -> uses Save Page Now
    """
    # Seed the local cache with an outdated snapshot, as the other
    # "expired_cache" tests do; without it this test would only cover the
    # uncached case, contrary to its name.
    snapshoted_at = SNAPSHOT_DATE - datetime.timedelta(days=2)
    retrieved_at = SNAPSHOT_DATE + datetime.timedelta(days=1)
    snapshot = models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=snapshoted_at,
        snapshot_url=None,
        retrieved_at=retrieved_at,
        retrieved_by="localhost",
        response_headers={"Content-Length": "16"},
        content=b"Old page content",
    )
    opdb_db.add_web_page_snapshots([snapshot])

    # Re-register the availability API endpoint so that it reports no archived
    # snapshot at all for the page.
    configured_requests_mocker.register_uri(
        "GET",
        "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
        complete_qs=True,
        json={"url": "http://example.org/", "archived_snapshots": {}},
    )

    # Reuse the other test; web_cache.Session should ignore the outdated local
    # snapshot and treat the absence of a page exactly the same way as an
    # expired one.
    test_get__uncached__expired_wb(
        configured_requests_mocker, opdb_db, authenticated, mock_time
    )