2022-09-20 19:33:55 +00:00
|
|
|
# This file is part of the Open Parts Database software
|
|
|
|
# Copyright (C) 2022 Valentin Lorentz
|
|
|
|
#
|
|
|
|
# This program is free software: you can redistribute it and/or modify it under the
|
|
|
|
# terms of the GNU Affero General Public License version 3, as published by the
|
|
|
|
# Free Software Foundation.
|
|
|
|
#
|
|
|
|
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
|
|
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
|
|
|
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU Affero General Public License along with
|
|
|
|
# this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
|
|
|
# pylint: disable=redefined-outer-name
|
|
|
|
|
|
|
|
"""
|
|
|
|
Test generic web page retrieval and caching
|
|
|
|
"""
|
|
|
|
|
|
|
|
import datetime
|
|
|
|
import socket
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
import requests_mock
|
|
|
|
|
|
|
|
from opdb.db import Db, models
|
|
|
|
from opdb.web_cache import Session
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture
def requests_mocker():
    """Yield an active :mod:`requests_mock` mocker.

    The mocker's context is entered before the value is yielded and left
    again when the fixture is torn down.
    """
    with requests_mock.mock() as active_mocker:
        yield active_mocker
|
|
|
|
|
|
|
|
|
|
|
|
# Capture time of the Wayback Machine snapshot advertised by the mocked
# availability API (timestamp "20220920014934"); the tests below derive their
# ``after_date`` cut-offs and stale snapshot dates relative to it.
SNAPSHOT_DATE = datetime.datetime(2022, 9, 20, 1, 49, 34, tzinfo=datetime.timezone.utc)
|
|
|
|
|
|
|
|
|
2022-09-25 07:49:01 +00:00
|
|
|
@pytest.fixture
def mock_time(mocker):
    """
    Patches ``time.sleep`` with a stub that returns immediately and adds the
    requested duration to a running total instead of waiting.

    Returns a zero-argument callable giving that running total (``0`` as long
    as ``time.sleep`` was never called).
    """
    # A one-entry dict stands in for a ``nonlocal`` counter.
    elapsed = {"seconds": 0}

    def fake_sleep(seconds):
        elapsed["seconds"] += seconds

    mocker.patch("time.sleep", side_effect=fake_sleep)

    return lambda: elapsed["seconds"]
|
|
|
|
|
|
|
|
|
2022-09-20 19:33:55 +00:00
|
|
|
@pytest.fixture
def configured_requests_mocker(requests_mocker):
    """Extension of :func:`requests_mocker` that registers the Wayback Machine
    URLs used by the tests for ``http://example.org/``: the availability API,
    raw (``id_``) snapshot pages, and both the anonymous and the authenticated
    Save Page Now (SPN) endpoints."""
    availability_payload = {
        "url": "http://example.org/",
        "archived_snapshots": {
            "closest": {
                "status": "200",
                "available": True,
                "url": (
                    "http://web.archive.org/web/20220920014934/"
                    "http://john.smith@example.org/"
                ),
                "timestamp": "20220920014934",
            }
        },
    }
    requests_mocker.register_uri(
        "GET",
        "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
        complete_qs=True,
        json=availability_payload,
    )

    # Raw snapshot pages, registered in the order listed.
    raw_snapshot_pages = [
        (
            "http://web.archive.org/web/20220920014934id_/http://john.smith@example.org/",
            "Example page content from Wayback Machine",
        ),
        (
            "https://web.archive.org/web/20220920164222id_/http://example.org/",
            "Example page content from Wayback Machine after Save Page Now",
        ),
        (
            "https://web.archive.org/web/20220922000000id_/http://example.org/",
            "Example page content from Wayback Machine after Save Page Now",
        ),
    ]
    for page_url, page_text in raw_snapshot_pages:
        requests_mocker.register_uri(
            "GET",
            page_url,
            complete_qs=True,
            text=page_text,
        )

    # Anonymous SPN:
    # the location of the new capture is carried by the "location" header.
    spn_headers = {
        "location": "https://web.archive.org/web/20220922000000/http://example.org/"
    }
    requests_mocker.register_uri(
        "GET",
        "https://web.archive.org/save/http://example.org/",
        complete_qs=True,
        headers=spn_headers,
        text="""
            <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
            <title>Redirecting...</title>
            <h1>Redirecting...</h1>
            """,
    )

    # Authenticated SPN:
    # a POST creates job "spn2-abcde", whose status endpoint is then polled.
    requests_mocker.register_uri(
        "POST",
        "https://web.archive.org/save/",
        complete_qs=True,
        text='{"url": "http://example.org/", "job_id": "spn2-abcde"}',
    )

    pending_status = {
        "text": """
                {"job_id":"spn2-abcde","resources":["blah"],"status":"pending"}
                """
    }
    success_status = {
        "text": """
                {"job_id":"spn2-abcde","resources":["blah"],"status":"success",
                "timestamp":"20220922000000"}
                """
    }
    # Response list: two "pending" answers followed by the "success" one.
    requests_mocker.register_uri(
        "GET",
        "https://web.archive.org/save/status/spn2-abcde",
        [pending_status, pending_status, success_status],
        complete_qs=True,
    )

    yield requests_mocker
|
|
|
|
|
|
|
|
|
|
|
|
def test_get__cached(configured_requests_mocker, opdb_db: Db):
    """
    Tests getting a snapshot that is already in the local cache

    The stored snapshot is dated "now", which is later than the session's
    cut-off date, so it is expected back unchanged and without any HTTP
    request being sent.
    """
    after_date = SNAPSHOT_DATE - datetime.timedelta(days=1)
    s = Session(opdb_db, after_date)

    retrieved_at = datetime.datetime(2022, 2, 1, 0, 0, 0, tzinfo=datetime.timezone.utc)
    content = b"Example page content"

    snapshot = models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=datetime.datetime.now(tz=datetime.timezone.utc),
        snapshot_url=None,
        retrieved_at=retrieved_at,
        retrieved_by="localhost",
        # Computed from the body so the declared length always matches it.
        response_headers={"Content-Length": str(len(content))},
        content=content,
    )

    opdb_db.add_web_page_snapshots([snapshot])

    assert s.get_or_fetch_snapshot("http://example.org/") == snapshot

    # Served from the cache: nothing may have gone through the HTTP mocker.
    assert [(r.method, r.url) for r in configured_requests_mocker.request_history] == []
|
|
|
|
|
|
|
|
|
|
|
|
def test_get__uncached__recent_wb(configured_requests_mocker, opdb_db: Db):
    """
    Tests getting a snapshot that is missing from the local cache while the
    Wayback Machine holds a capture more recent than the cut-off date
    """
    utc = datetime.timezone.utc
    wayback_raw_url = (
        "http://web.archive.org/web/20220920014934id_/"
        "http://john.smith@example.org/"
    )

    s = Session(opdb_db, SNAPSHOT_DATE - datetime.timedelta(days=1))

    # Bracket the call so ``retrieved_at`` can be bounds-checked.
    dt_before = datetime.datetime.now(tz=utc)
    snapshot = s.get_or_fetch_snapshot("http://example.org/")
    dt_after = datetime.datetime.now(tz=utc)

    assert snapshot is not None
    assert dt_before <= snapshot.retrieved_at <= dt_after

    expected_snapshot = models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=datetime.datetime(2022, 9, 20, 1, 49, 34, tzinfo=utc),
        snapshot_url=wayback_raw_url,
        # Already bounds-checked above; reuse it so equality can be exact.
        retrieved_at=snapshot.retrieved_at,
        retrieved_by=socket.getfqdn(),
        response_headers={},
        content=b"Example page content from Wayback Machine",
    )
    assert snapshot == expected_snapshot

    # The fetched snapshot must also have been written to the database.
    assert snapshot == opdb_db.get_last_web_page_snapshot("http://example.org/")

    sent_requests = [
        (r.method, r.url) for r in configured_requests_mocker.request_history
    ]
    assert sent_requests == [
        (
            "GET",
            "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
        ),
        ("GET", wayback_raw_url),
    ]
|
|
|
|
|
|
|
|
|
|
|
|
def test_get__expired_cache__recent_wb(configured_requests_mocker, opdb_db: Db):
    """
    Tests getting a snapshot whose locally cached copy is expired, while the
    Wayback Machine holds a recent enough capture
    """
    # Dated two days before SNAPSHOT_DATE, i.e. older than the cut-off
    # (SNAPSHOT_DATE minus one day) used by the reused test below.
    stale_snapshot = models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=SNAPSHOT_DATE - datetime.timedelta(days=2),
        snapshot_url=None,
        retrieved_at=SNAPSHOT_DATE + datetime.timedelta(days=1),
        retrieved_by="localhost",
        response_headers={"Content-Length": "16"},
        content=b"Old page content",
    )
    opdb_db.add_web_page_snapshots([stale_snapshot])

    # Reuse the other test; web_cache.Session should simply ignore the outdated snapshot
    test_get__uncached__recent_wb(configured_requests_mocker, opdb_db)
|
|
|
|
|
|
|
|
|
2022-09-25 07:49:01 +00:00
|
|
|
@pytest.fixture(params=[False, True], ids=["anonymous", "authenticated"])
def authenticated(request) -> bool:
    """Parametrized fixture: ``False`` (test id "anonymous"), then ``True``
    (test id "authenticated")"""
    return request.param
|
|
|
|
|
|
|
|
|
|
|
|
def test_get__uncached__expired_wb(
    configured_requests_mocker, opdb_db: Db, authenticated: bool, mock_time
):
    """
    Tests getting a snapshot that is missing from the local cache and whose
    Wayback Machine capture is older than the cut-off date
    -> a new capture is requested through Save Page Now
    """
    utc = datetime.timezone.utc
    availability_url = (
        "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F"
    )
    # SPN returns HTTPS URLs
    spn_raw_url = "https://web.archive.org/web/20220922000000id_/http://example.org/"
    status_url = "https://web.archive.org/save/status/spn2-abcde"

    # Credentials are only passed in the authenticated variant.
    session_options = {"ias3_auth": "akey:skey"} if authenticated else {}
    s = Session(
        opdb_db, SNAPSHOT_DATE + datetime.timedelta(days=1), **session_options
    )

    assert mock_time() == 0

    dt_before = datetime.datetime.now(tz=utc)
    snapshot = s.get_or_fetch_snapshot("http://example.org/")
    dt_after = datetime.datetime.now(tz=utc)

    # authenticated: three time.sleep(5) calls
    # anonymous: no sleep() because blocking on SPN GET request
    expected_sleep = 15 if authenticated else 0
    assert mock_time() == expected_sleep

    assert snapshot is not None
    assert dt_before <= snapshot.retrieved_at <= dt_after

    expected_snapshot = models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=datetime.datetime(2022, 9, 22, 0, 0, 0, tzinfo=utc),
        snapshot_url=spn_raw_url,
        retrieved_at=snapshot.retrieved_at,
        retrieved_by=socket.getfqdn(),
        response_headers={},
        content=b"Example page content from Wayback Machine after Save Page Now",
    )
    assert snapshot == expected_snapshot

    assert snapshot == opdb_db.get_last_web_page_snapshot("http://example.org/")

    history = configured_requests_mocker.request_history
    if authenticated:
        # The request body is compared too, to check the form-encoded POST.
        assert [(r.method, r.url, r.body) for r in history] == [
            ("GET", availability_url, None),
            (
                "POST",
                "https://web.archive.org/save/",
                "url=http%3A%2F%2Fexample.org%2F",
            ),
            ("GET", status_url, None),
            ("GET", status_url, None),
            ("GET", status_url, None),
            ("GET", spn_raw_url, None),
        ]
    else:
        assert [(r.method, r.url) for r in history] == [
            ("GET", availability_url),
            ("GET", "https://web.archive.org/save/http://example.org/"),
            ("GET", spn_raw_url),
        ]
|
|
|
|
|
|
|
|
|
|
|
|
def test_get__expired_cache__expired_wb(
    configured_requests_mocker, opdb_db: Db, authenticated: bool, mock_time
):
    """
    Tests getting a snapshot whose locally cached copy is expired and whose
    Wayback Machine capture is expired too -> uses Save Page Now
    """
    # Dated two days before SNAPSHOT_DATE, hence older than the cut-off
    # (SNAPSHOT_DATE plus one day) used by the reused test below.
    stale_snapshot = models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=SNAPSHOT_DATE - datetime.timedelta(days=2),
        snapshot_url=None,
        retrieved_at=SNAPSHOT_DATE + datetime.timedelta(days=1),
        retrieved_by="localhost",
        response_headers={"Content-Length": "16"},
        content=b"Old page content",
    )
    opdb_db.add_web_page_snapshots([stale_snapshot])

    # Reuse the other test; web_cache.Session should simply ignore the outdated snapshot
    test_get__uncached__expired_wb(
        configured_requests_mocker, opdb_db, authenticated, mock_time
    )
|
2022-09-20 19:33:55 +00:00
|
|
|
|
|
|
|
|
2022-09-25 07:49:01 +00:00
|
|
|
def test_get__expired_cache__no_wb(
    configured_requests_mocker, opdb_db: Db, authenticated: bool, mock_time
):
    """
    Tests getting a snapshot that is expired in the local cache, and absent from
    the Wayback Machine -> uses Save Page Now
    """
    # Seed the local cache with an outdated snapshot, as the test name and
    # docstring require; without it this would only cover the uncached case.
    # It is dated two days before SNAPSHOT_DATE, hence older than the cut-off
    # (SNAPSHOT_DATE plus one day) used by the reused test below.
    stale_snapshot = models.WebPageSnapshot(
        url="http://example.org/",
        snapshot_date=SNAPSHOT_DATE - datetime.timedelta(days=2),
        snapshot_url=None,
        retrieved_at=SNAPSHOT_DATE + datetime.timedelta(days=1),
        retrieved_by="localhost",
        response_headers={"Content-Length": "16"},
        content=b"Old page content",
    )
    opdb_db.add_web_page_snapshots([stale_snapshot])

    # Re-register the availability endpoint so that it reports no archived
    # snapshot at all for the page.
    configured_requests_mocker.register_uri(
        "GET",
        "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
        complete_qs=True,
        json={"url": "http://example.org/", "archived_snapshots": {}},
    )

    # Reuse the other test; web_cache.Session should treat the absence of a page
    # exactly the same way as an expired one.
    test_get__uncached__expired_wb(
        configured_requests_mocker, opdb_db, authenticated, mock_time
    )
|