# This file is part of the Open Parts Database software
# Copyright (C) 2022 Valentin Lorentz
#
# This program is free software: you can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License version 3, as published by the
# Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License along with
# this program. If not, see <https://www.gnu.org/licenses/>.

# pylint: disable=redefined-outer-name

"""
Test generic web page retrieval and caching
"""

import datetime
import socket

import pytest
import requests_mock

from opdb.db import Db, models
from opdb.web_cache import Session


@pytest.fixture
def requests_mocker():
    """Fixture wrapper for :mod:`requests_mock`"""
    with requests_mock.mock() as m:
        yield m


# Date of the snapshot advertised by the mocked Wayback Machine availability API
# (matches the "20220920014934" timestamp registered below).
SNAPSHOT_DATE = datetime.datetime(2022, 9, 20, 1, 49, 34, tzinfo=datetime.timezone.utc)


@pytest.fixture
def mock_time(mocker):
    """
    Makes time.sleep return immediately, and returns a callable that returns
    the total offset introduced by time.sleep calls.
    """
    offset = 0

    def sleep(seconds):
        # Record the requested duration instead of actually waiting.
        nonlocal offset
        offset += seconds

    mocker.patch("time.sleep", side_effect=sleep)

    def get_offset():
        return offset

    return get_offset


@pytest.fixture
def configured_requests_mocker(requests_mocker):
    """Extension of :func:`requests_mocker` that registers a bunch of URLs
    of the Wayback Machine API."""
    # Availability API: reports a single snapshot taken at SNAPSHOT_DATE.
    requests_mocker.register_uri(
        "GET",
        "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
        complete_qs=True,
        json={
            "url": "http://example.org/",
            "archived_snapshots": {
                "closest": {
                    "status": "200",
                    "available": True,
                    "url": (
                        "http://web.archive.org/web/20220920014934/"
                        "http://john.smith@example.org/"
                    ),
                    "timestamp": "20220920014934",
                }
            },
        },
    )

    # Raw ("id_") content of the snapshot advertised above.
    requests_mocker.register_uri(
        "GET",
        "http://web.archive.org/web/20220920014934id_/http://john.smith@example.org/",
        complete_qs=True,
        text="Example page content from Wayback Machine",
    )

    # Raw content of snapshots created through Save Page Now.
    requests_mocker.register_uri(
        "GET",
        "https://web.archive.org/web/20220920164222id_/http://example.org/",
        complete_qs=True,
        text="Example page content from Wayback Machine after Save Page Now",
    )
    requests_mocker.register_uri(
        "GET",
        "https://web.archive.org/web/20220922000000id_/http://example.org/",
        complete_qs=True,
        text="Example page content from Wayback Machine after Save Page Now",
    )

    # Anonymous SPN:
    requests_mocker.register_uri(
        "GET",
        "https://web.archive.org/save/http://example.org/",
        complete_qs=True,
        headers={
            "location": "https://web.archive.org/web/20220922000000/http://example.org/"
        },
        text=" Redirecting...\n\nRedirecting...\n\n",
    )

    # Authenticated SPN:
    requests_mocker.register_uri(
        "POST",
        "https://web.archive.org/save/",
        complete_qs=True,
        text='{"url": "http://example.org/", "job_id": "spn2-abcde"}',
    )
    # The job status is "pending" for the first two polls, then "success".
    requests_mocker.register_uri(
        "GET",
        "https://web.archive.org/save/status/spn2-abcde",
        [
            {
                "text": (
                    ' {"job_id":"spn2-abcde","resources":["blah"],"status":"pending"} '
                )
            },
            {
                "text": (
                    ' {"job_id":"spn2-abcde","resources":["blah"],"status":"pending"} '
                )
            },
            {
                "text": (
                    ' {"job_id":"spn2-abcde","resources":["blah"],"status":"success",'
                    ' "timestamp":"20220922000000"} '
                )
            },
        ],
        complete_qs=True,
    )

    yield requests_mocker


def _add_expired_snapshot(opdb_db: Db) -> None:
    """
    Stores in ``opdb_db`` a snapshot of http://example.org/ taken two days before
    :data:`SNAPSHOT_DATE`, which is older than the ``after_date`` used by every
    test of this module, so it is expected to be ignored.
    """
    snapshot = models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=SNAPSHOT_DATE - datetime.timedelta(days=2),
        snapshot_url=None,
        retrieved_at=SNAPSHOT_DATE + datetime.timedelta(days=1),
        retrieved_by="localhost",
        response_headers={"Content-Length": "16"},
        content=b"Old page content",
    )
    opdb_db.add_web_page_snapshots([snapshot])


def test_get__cached(configured_requests_mocker, opdb_db: Db):
    """
    Tests getting a snapshot that is already in the local cache
    """
    after_date = SNAPSHOT_DATE - datetime.timedelta(days=1)
    s = Session(opdb_db, after_date)

    retrieved_at = datetime.datetime(2022, 2, 1, 0, 0, 0, tzinfo=datetime.timezone.utc)
    snapshot = models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=datetime.datetime.now(tz=datetime.timezone.utc),
        snapshot_url=None,
        retrieved_at=retrieved_at,
        retrieved_by="localhost",
        response_headers={"Content-Length": "19"},
        content=b"Example page content",
    )
    opdb_db.add_web_page_snapshots([snapshot])

    assert s.get_or_fetch_snapshot("http://example.org/") == snapshot

    # Served from the local cache: no HTTP request should have been sent.
    assert [(r.method, r.url) for r in configured_requests_mocker.request_history] == []


def test_get__uncached__recent_wb(configured_requests_mocker, opdb_db: Db):
    """
    Tests getting a snapshot that is not in the local cache, but is available
    in the Wayback Machine
    """
    after_date = SNAPSHOT_DATE - datetime.timedelta(days=1)
    s = Session(opdb_db, after_date)

    dt_before = datetime.datetime.now(tz=datetime.timezone.utc)
    snapshot = s.get_or_fetch_snapshot("http://example.org/")
    dt_after = datetime.datetime.now(tz=datetime.timezone.utc)

    assert snapshot is not None
    assert dt_before <= snapshot.retrieved_at <= dt_after

    assert snapshot == models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=datetime.datetime(
            2022, 9, 20, 1, 49, 34, tzinfo=datetime.timezone.utc
        ),
        snapshot_url=(
            "http://web.archive.org/web/20220920014934id_/"
            "http://john.smith@example.org/"
        ),
        retrieved_at=snapshot.retrieved_at,
        retrieved_by=socket.getfqdn(),
        response_headers={},
        content=b"Example page content from Wayback Machine",
    )

    assert snapshot == opdb_db.get_last_web_page_snapshot("http://example.org/")

    assert [(r.method, r.url) for r in configured_requests_mocker.request_history] == [
        (
            "GET",
            "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
        ),
        (
            "GET",
            "http://web.archive.org/web/20220920014934id_/"
            "http://john.smith@example.org/",
        ),
    ]


def test_get__expired_cache__recent_wb(configured_requests_mocker, opdb_db: Db):
    """
    Tests getting a snapshot that is expired in the local cache, but is available
    in the Wayback Machine
    """
    _add_expired_snapshot(opdb_db)

    # Reuse the other test; web_cache.Session should simply ignore the outdated snapshot
    test_get__uncached__recent_wb(configured_requests_mocker, opdb_db)


@pytest.fixture(
    params=[
        pytest.param(False, id="anonymous"),
        pytest.param(True, id="authenticated"),
    ],
)
def authenticated(request) -> bool:
    """Parametrized by False/True"""
    return request.param


def test_get__uncached__expired_wb(
    configured_requests_mocker, opdb_db: Db, authenticated: bool, mock_time
):
    """
    Tests getting a snapshot that is not in the local cache, and expired
    in the Wayback Machine -> uses Save Page Now
    """
    after_date = SNAPSHOT_DATE + datetime.timedelta(days=1)
    if authenticated:
        s = Session(opdb_db, after_date, ias3_auth="akey:skey")
    else:
        s = Session(opdb_db, after_date)

    assert mock_time() == 0

    dt_before = datetime.datetime.now(tz=datetime.timezone.utc)
    snapshot = s.get_or_fetch_snapshot("http://example.org/")
    dt_after = datetime.datetime.now(tz=datetime.timezone.utc)

    if authenticated:
        assert mock_time() == 15  # three time.sleep(5) calls
    else:
        assert mock_time() == 0  # no sleep() because blocking on SPN GET request

    assert snapshot is not None
    assert dt_before <= snapshot.retrieved_at <= dt_after

    assert snapshot == models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=datetime.datetime(
            2022, 9, 22, 0, 0, 0, tzinfo=datetime.timezone.utc
        ),
        snapshot_url=(
            "https://web.archive.org/web/20220922000000id_/"  # SPN returns HTTPS URLs
            "http://example.org/"
        ),
        retrieved_at=snapshot.retrieved_at,
        retrieved_by=socket.getfqdn(),
        response_headers={},
        content=b"Example page content from Wayback Machine after Save Page Now",
    )

    assert snapshot == opdb_db.get_last_web_page_snapshot("http://example.org/")

    if authenticated:
        assert [
            (r.method, r.url, r.body)
            for r in configured_requests_mocker.request_history
        ] == [
            (
                "GET",
                "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
                None,
            ),
            (
                "POST",
                "https://web.archive.org/save/",
                "url=http%3A%2F%2Fexample.org%2F",
            ),
            ("GET", "https://web.archive.org/save/status/spn2-abcde", None),
            ("GET", "https://web.archive.org/save/status/spn2-abcde", None),
            ("GET", "https://web.archive.org/save/status/spn2-abcde", None),
            (
                "GET",
                "https://web.archive.org/web/20220922000000id_/"  # ditto
                "http://example.org/",
                None,
            ),
        ]
    else:
        assert [
            (r.method, r.url) for r in configured_requests_mocker.request_history
        ] == [
            (
                "GET",
                "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
            ),
            (
                "GET",
                "https://web.archive.org/save/http://example.org/",
            ),
            (
                "GET",
                "https://web.archive.org/web/20220922000000id_/"  # ditto
                "http://example.org/",
            ),
        ]


def test_get__expired_cache__expired_wb(
    configured_requests_mocker, opdb_db: Db, authenticated: bool, mock_time
):
    """
    Tests getting a snapshot that is expired in the local cache, and expired
    in the Wayback Machine -> uses Save Page Now
    """
    _add_expired_snapshot(opdb_db)

    # Reuse the other test; web_cache.Session should simply ignore the outdated snapshot
    test_get__uncached__expired_wb(
        configured_requests_mocker, opdb_db, authenticated, mock_time
    )


def test_get__expired_cache__no_wb(
    configured_requests_mocker, opdb_db: Db, authenticated: bool, mock_time
):
    """
    Tests getting a snapshot that is expired in the local cache, and absent from
    the Wayback Machine -> uses Save Page Now
    """
    # Seed the local cache with an outdated snapshot, as the test name promises;
    # without it this test would only cover the "uncached" case.
    _add_expired_snapshot(opdb_db)

    # Override the availability API response: the Wayback Machine has no snapshot.
    configured_requests_mocker.register_uri(
        "GET",
        "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
        complete_qs=True,
        json={"url": "http://example.org/", "archived_snapshots": {}},
    )

    # Reuse the other test; web_cache.Session should treat the absence of a page
    # exactly the same way as an expired one.
    test_get__uncached__expired_wb(
        configured_requests_mocker, opdb_db, authenticated, mock_time
    )