Add optional support for the authenticated SPN API

The authenticated API has much higher rate limits than the anonymous Save Page Now endpoint.
Val Lorentz 2022-09-25 09:49:01 +02:00
parent 1db60954d6
commit 1eb169ee6c
5 changed files with 241 additions and 34 deletions
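
A minimal sketch of how a worker opts into the authenticated API. The import path and the Db setup are assumptions based on the diff below; the "accesskey:secretkey" format matches the "akey:skey" value used in the tests:

import datetime
import os

from opdb.web_cache import Session  # assumed import path for the module changed below

db = ...  # an opdb Db instance, constructed however the workers already do it

# Passing ias3_auth switches Session to the authenticated SPN API;
# leaving it as None keeps the anonymous flow.
session = Session(
    db,
    min_snapshot_date=datetime.datetime(2022, 9, 1, tzinfo=datetime.timezone.utc),
    ias3_auth=os.environ.get("IAS3_AUTH"),  # "accesskey:secretkey", or unset
)

# get_or_fetch_snapshot now returns an Optional: None means SPN failed
# and the next workflow run should retry.
snapshot = session.get_or_fetch_snapshot("http://example.org/")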


@@ -4,7 +4,7 @@ pipeline:
     image: python:3.9
     commands:
       - pip3 install flake8
-      - make flake8
+      - flake8 opdb/
   pylint:
     group: lint
     image: cytopia/pylint


@@ -10,7 +10,7 @@ black-check:
 black:
 	black opdb/
 
-flake8:
+flake8: black
 	flake8 opdb/
 
 pylint:


@@ -22,6 +22,7 @@ citation, and caches them in the local database for quick access by other workers
 import datetime
 import re
 import socket
+import time
 import typing
 
 import pkg_resources
@@ -57,11 +58,14 @@ class Session:
     archived in the Internet Archive.
     """
 
-    def __init__(self, db: Db, min_snapshot_date: datetime.datetime):
+    def __init__(
+        self, db: Db, min_snapshot_date: datetime.datetime, ias3_auth: str = None
+    ):
         self.min_snapshot_date = min_snapshot_date
         self._db = db
         self._session = requests.Session()
         self._session.headers["User-Agent"] = USER_AGENT
+        self._ias3_auth = ias3_auth
 
     def _fetch_newest_wayback_snapshot(
         self, url: str
@@ -122,12 +126,88 @@ class Session:
             content=response.content,
         )
 
-    def _save_page_now(self, url: str) -> models.WebPageSnapshot:
-        response = self._session.get(
-            f"https://web.archive.org/save/{url}", allow_redirects=False
-        )
+    def _save_page_now(self, url: str) -> typing.Optional[models.WebPageSnapshot]:
+        if self._ias3_auth is None:
+            return self._anonymous_save_page_now(url)
+        else:
+            return self._authenticated_save_page_now(url, self._ias3_auth)
+
+    def _anonymous_save_page_now(
+        self, url: str
+    ) -> typing.Optional[models.WebPageSnapshot]:
+        while True:
+            try:
+                response = self._session.get(
+                    f"https://web.archive.org/save/{url}",
+                    allow_redirects=False,
+                )
+                response.raise_for_status()
+            except requests.exceptions.HTTPError as e:
+                if e.response.status_code == 429:
+                    print(e)
+                    print("Sleeping...")
+                    time.sleep(10)
+                    continue
+                elif e.response.status_code == 520:
+                    # "Job failed". We will try again in the next workflow run.
+                    return None
+                else:
+                    raise
+            else:
+                wayback_url = response.headers["Location"]
+                return self._fetch_wayback_snapshot(url, wayback_url)
+
+    def _authenticated_save_page_now(
+        self, url: str, ias3_auth: str
+    ) -> typing.Optional[models.WebPageSnapshot]:
+        for _ in range(3):
+            response = self._session.post(
+                "https://web.archive.org/save/",
+                allow_redirects=False,
+                data={"url": url},
+                headers={
+                    "Accept": "application/json",
+                    "Authorization": f"LOW {ias3_auth}",
+                },
+            )
+            response.raise_for_status()
+            job_id = response.json()["job_id"]
+
+            status = "pending"
+            while status == "pending":
+                time.sleep(5)
+                response = self._session.get(
+                    f"https://web.archive.org/save/status/{job_id}"
+                )
+                response.raise_for_status()
+                status = response.json()["status"]
+
+            if status == "success":
+                break
+            if response.json()["status_ext"] == "error:service-unavailable":
+                print(response.json()["message"])
+                time.sleep(10)
+                continue  # retry
+            elif response.json()["status_ext"] == "error:too-many-daily-captures":
+                # typically happens when a page repeatedly fails, so we
+                # (unsuccessfully) tried to capture it too many times
+                return None
+            assert False, response.json()
+        else:
+            print("Too many failures; giving up.")
+            return None
+
+        ia_timestamp = response.json()["timestamp"]
+        snapshot_date = _datetime_from_ia_timestamp(ia_timestamp)
+        assert snapshot_date >= self.min_snapshot_date, (
+            snapshot_date.isoformat(),
+            self.min_snapshot_date.isoformat(),
+        )
-        response.raise_for_status()  # TODO: retry
-        wayback_url = response.headers["Location"]
+        wayback_url = f"https://web.archive.org/web/{ia_timestamp}/{url}"
         return self._fetch_wayback_snapshot(url, wayback_url)
 
     def _get_cached_snapshot(self, url: str) -> typing.Optional[models.WebPageSnapshot]:
@@ -137,7 +217,9 @@ class Session:
         return snapshot
 
-    def get_or_fetch_snapshot(self, url: str) -> models.WebPageSnapshot:
+    def get_or_fetch_snapshot(
+        self, url: str
+    ) -> typing.Optional[models.WebPageSnapshot]:
         """
         Fetches the given URL from the local cache or from the Wayback Machine.
@@ -158,5 +240,6 @@ class Session:
         # If the Internet Archive does not have it yet, trigger its Save Page Now,
         # and query the Wayback Machine again
         snapshot = self._save_page_now(url)
-        self._db.add_web_page_snapshots([snapshot])
+        if snapshot is not None:
+            self._db.add_web_page_snapshots([snapshot])
         return snapshot
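
Stripped of the caching and retry machinery above, the SPN2 exchange that _authenticated_save_page_now performs reduces to the following sketch. Endpoints, headers, and JSON fields are those used in the diff; the credentials and URL are placeholders:

import time

import requests

ias3_auth = "accesskey:secretkey"  # placeholder IAS3 key pair
session = requests.Session()

# Submit the capture job.
response = session.post(
    "https://web.archive.org/save/",
    data={"url": "http://example.org/"},
    headers={"Accept": "application/json", "Authorization": f"LOW {ias3_auth}"},
)
response.raise_for_status()
job_id = response.json()["job_id"]

# Poll until the job leaves the "pending" state.
status = "pending"
while status == "pending":
    time.sleep(5)
    response = session.get(f"https://web.archive.org/save/status/{job_id}")
    response.raise_for_status()
    status = response.json()["status"]

if status == "success":
    # The returned timestamp names the snapshot in the Wayback Machine.
    print(f"https://web.archive.org/web/{response.json()['timestamp']}/http://example.org/")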


@@ -38,6 +38,26 @@ def requests_mocker():
 SNAPSHOT_DATE = datetime.datetime(2022, 9, 20, 1, 49, 34, tzinfo=datetime.timezone.utc)
 
 
+@pytest.fixture
+def mock_time(mocker):
+    """
+    Makes time.sleep return immediately, and returns a callable that returns the
+    total offset introduced by time.sleep calls.
+    """
+    offset = 0
+
+    def sleep(seconds):
+        nonlocal offset
+        offset += seconds
+
+    mocker.patch("time.sleep", side_effect=sleep)
+
+    def get_offset():
+        return offset
+
+    return get_offset
+
+
 @pytest.fixture
 def configured_requests_mocker(requests_mocker):
     """Extension of :func:`requests_mocker` that registers a bunch of URLs of
@@ -75,13 +95,20 @@ def configured_requests_mocker(requests_mocker):
         complete_qs=True,
         text="Example page content from Wayback Machine after Save Page Now",
     )
+    requests_mocker.register_uri(
+        "GET",
+        "https://web.archive.org/web/20220922000000id_/http://example.org/",
+        complete_qs=True,
+        text="Example page content from Wayback Machine after Save Page Now",
+    )
 
+    # Anonymous SPN:
     requests_mocker.register_uri(
         "GET",
         "https://web.archive.org/save/http://example.org/",
         complete_qs=True,
         headers={
-            "location": "https://web.archive.org/web/20220920164222/http://example.org/"
+            "location": "https://web.archive.org/web/20220922000000/http://example.org/"
         },
         text="""
 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
@@ -90,6 +117,37 @@ def configured_requests_mocker(requests_mocker):
 """,
     )
 
+    # Authenticated SPN:
+    requests_mocker.register_uri(
+        "POST",
+        "https://web.archive.org/save/",
+        complete_qs=True,
+        text='{"url": "http://example.org/", "job_id": "spn2-abcde"}',
+    )
+    requests_mocker.register_uri(
+        "GET",
+        "https://web.archive.org/save/status/spn2-abcde",
+        [
+            dict(
+                text="""
+                {"job_id":"spn2-abcde","resources":["blah"],"status":"pending"}
+                """
+            ),
+            dict(
+                text="""
+                {"job_id":"spn2-abcde","resources":["blah"],"status":"pending"}
+                """
+            ),
+            dict(
+                text="""
+                {"job_id":"spn2-abcde","resources":["blah"],"status":"success",
+                "timestamp":"20220922000000"}
+                """
+            ),
+        ],
+        complete_qs=True,
+    )
+
     yield requests_mocker
@@ -131,6 +189,7 @@ def test_get__uncached__recent_wb(configured_requests_mocker, opdb_db: Db):
     snapshot = s.get_or_fetch_snapshot("http://example.org/")
     dt_after = datetime.datetime.now(tz=datetime.timezone.utc)
 
+    assert snapshot is not None
     assert dt_before <= snapshot.retrieved_at <= dt_after
     assert snapshot == models.WebPageSnapshot(
@@ -187,27 +246,51 @@ def test_get__expired_cache__recent_wb(configured_requests_mocker, opdb_db: Db):
     test_get__uncached__recent_wb(configured_requests_mocker, opdb_db)
 
 
-def test_get__uncached__expired_wb(configured_requests_mocker, opdb_db: Db):
+@pytest.fixture(
+    params=[
+        pytest.param(False, id="anonymous"),
+        pytest.param(True, id="authenticated"),
+    ],
+)
+def authenticated(request) -> bool:
+    """Parametrized by False/True"""
+    return request.param
+
+
+def test_get__uncached__expired_wb(
+    configured_requests_mocker, opdb_db: Db, authenticated: bool, mock_time
+):
     """
     Tests getting a snapshot that is not in the local cache, and expired in
     the Wayback Machine -> uses Save Page Now
     """
     after_date = SNAPSHOT_DATE + datetime.timedelta(days=1)
-    s = Session(opdb_db, after_date)
+    if authenticated:
+        s = Session(opdb_db, after_date, ias3_auth="akey:skey")
+    else:
+        s = Session(opdb_db, after_date)
+
+    assert mock_time() == 0
     dt_before = datetime.datetime.now(tz=datetime.timezone.utc)
     snapshot = s.get_or_fetch_snapshot("http://example.org/")
     dt_after = datetime.datetime.now(tz=datetime.timezone.utc)
+    if authenticated:
+        assert mock_time() == 15  # three time.sleep(5) calls
+    else:
+        assert mock_time() == 0  # no sleep() because blocking on SPN GET request
 
+    assert snapshot is not None
     assert dt_before <= snapshot.retrieved_at <= dt_after
     assert snapshot == models.WebPageSnapshot(
         url="http://example.org/",
         snapshot_date=datetime.datetime(
-            2022, 9, 20, 16, 42, 22, tzinfo=datetime.timezone.utc
+            2022, 9, 22, 0, 0, 0, tzinfo=datetime.timezone.utc
         ),
         snapshot_url=(
-            "https://web.archive.org/web/20220920164222id_/"  # SPN returns HTTPS URLs
+            "https://web.archive.org/web/20220922000000id_/"  # SPN returns HTTPS URLs
             "http://example.org/"
         ),
         retrieved_at=snapshot.retrieved_at,
@@ -218,24 +301,54 @@ def test_get__uncached__expired_wb(configured_requests_mocker, opdb_db: Db):
     assert snapshot == opdb_db.get_last_web_page_snapshot("http://example.org/")
 
-    assert [(r.method, r.url) for r in configured_requests_mocker.request_history] == [
-        (
-            "GET",
-            "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
-        ),
-        (
-            "GET",
-            "https://web.archive.org/save/http://example.org/",
-        ),
-        (
-            "GET",
-            "https://web.archive.org/web/20220920164222id_/"  # ditto
-            "http://example.org/",
-        ),
-    ]
+    if authenticated:
+        assert [
+            (r.method, r.url, r.body)
+            for r in configured_requests_mocker.request_history
+        ] == [
+            (
+                "GET",
+                "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
+                None,
+            ),
+            (
+                "POST",
+                "https://web.archive.org/save/",
+                "url=http%3A%2F%2Fexample.org%2F",
+            ),
+            ("GET", "https://web.archive.org/save/status/spn2-abcde", None),
+            ("GET", "https://web.archive.org/save/status/spn2-abcde", None),
+            ("GET", "https://web.archive.org/save/status/spn2-abcde", None),
+            (
+                "GET",
+                "https://web.archive.org/web/20220922000000id_/"  # ditto
+                "http://example.org/",
+                None,
+            ),
+        ]
+    else:
+        assert [
+            (r.method, r.url) for r in configured_requests_mocker.request_history
+        ] == [
+            (
+                "GET",
+                "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
+            ),
+            (
+                "GET",
+                "https://web.archive.org/save/http://example.org/",
+            ),
+            (
+                "GET",
+                "https://web.archive.org/web/20220922000000id_/"  # ditto
+                "http://example.org/",
+            ),
+        ]
 
 
-def test_get__expired_cache__expired_wb(configured_requests_mocker, opdb_db: Db):
+def test_get__expired_cache__expired_wb(
+    configured_requests_mocker, opdb_db: Db, authenticated: bool, mock_time
+):
     """
     Tests getting a snapshot that is expired in the local cache, and expired in
     the Wayback Machine -> uses Save Page Now
@@ -256,10 +369,14 @@ def test_get__expired_cache__expired_wb(configured_requests_mocker, opdb_db: Db):
     opdb_db.add_web_page_snapshots([snapshot])
 
     # Reuse the other test; web_cache.Session should simply ignore the outdated snapshot
-    test_get__uncached__expired_wb(configured_requests_mocker, opdb_db)
+    test_get__uncached__expired_wb(
+        configured_requests_mocker, opdb_db, authenticated, mock_time
+    )
 
 
-def test_get__expired_cache__no_wb(configured_requests_mocker, opdb_db: Db):
+def test_get__expired_cache__no_wb(
+    configured_requests_mocker, opdb_db: Db, authenticated: bool, mock_time
+):
     """
     Tests getting a snapshot that is expired in the local cache, and absent from
     the Wayback Machine -> uses Save Page Now
@@ -273,4 +390,6 @@ def test_get__expired_cache__no_wb(configured_requests_mocker, opdb_db: Db):
     # Reuse the other test; web_cache.Session should treat the absence of a page
     # exactly the same way as an expired one.
-    test_get__uncached__expired_wb(configured_requests_mocker, opdb_db)
+    test_get__uncached__expired_wb(
+        configured_requests_mocker, opdb_db, authenticated, mock_time
+    )


@@ -15,6 +15,7 @@ dependencies = [
 [project.optional-dependencies]
 testing = [
     "pytest",
+    "pytest-mock",
    "pytest-postgresql",
     "requests-mock",
     "types-requests",
@@ -41,8 +42,12 @@ disable = [
     "fixme",
     "invalid-name",
     "no-else-return",
+    "no-else-continue",
     "too-few-public-methods",
     "too-many-instance-attributes",
+    # false positives:
+    "unreachable",
+    "assignment-from-no-return",
     # mypy does it better:
     "no-member",
     "import-error",