Add optional support for the authenticated SPN API
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful

It has much higher rate limits.
This commit is contained in:
Val Lorentz 2022-09-25 09:49:01 +02:00
parent 1db60954d6
commit 1eb169ee6c
5 changed files with 241 additions and 34 deletions

View File

@ -4,7 +4,7 @@ pipeline:
image: python:3.9 image: python:3.9
commands: commands:
- pip3 install flake8 - pip3 install flake8
- make flake8 - flake8 opdb/
pylint: pylint:
group: lint group: lint
image: cytopia/pylint image: cytopia/pylint

View File

@ -10,7 +10,7 @@ black-check:
black: black:
black opdb/ black opdb/
flake8: flake8: black
flake8 opdb/ flake8 opdb/
pylint: pylint:

View File

@ -22,6 +22,7 @@ citation, and caches them in the local database for quick access by other worker
import datetime import datetime
import re import re
import socket import socket
import time
import typing import typing
import pkg_resources import pkg_resources
@ -57,11 +58,14 @@ class Session:
archived in the Internet Archive. archived in the Internet Archive.
""" """
def __init__(self, db: Db, min_snapshot_date: datetime.datetime): def __init__(
self, db: Db, min_snapshot_date: datetime.datetime, ias3_auth: str = None
):
self.min_snapshot_date = min_snapshot_date self.min_snapshot_date = min_snapshot_date
self._db = db self._db = db
self._session = requests.Session() self._session = requests.Session()
self._session.headers["User-Agent"] = USER_AGENT self._session.headers["User-Agent"] = USER_AGENT
self._ias3_auth = ias3_auth
def _fetch_newest_wayback_snapshot( def _fetch_newest_wayback_snapshot(
self, url: str self, url: str
@ -122,12 +126,88 @@ class Session:
content=response.content, content=response.content,
) )
def _save_page_now(self, url: str) -> models.WebPageSnapshot: def _save_page_now(self, url: str) -> typing.Optional[models.WebPageSnapshot]:
response = self._session.get( if self._ias3_auth is None:
f"https://web.archive.org/save/{url}", allow_redirects=False return self._anonymous_save_page_now(url)
else:
return self._authenticated_save_page_now(url, self._ias3_auth)
def _anonymous_save_page_now(
    self, url: str
) -> typing.Optional[models.WebPageSnapshot]:
    """
    Asks the unauthenticated Save Page Now endpoint to archive ``url``.

    Issues a GET on ``https://web.archive.org/save/<url>`` without following
    redirects, then hands the ``Location`` header of the response to
    :meth:`_fetch_wayback_snapshot`.

    * HTTP 429: prints the error, sleeps 10 seconds, and tries again; there
      is no upper bound on the number of attempts.
    * HTTP 520: returns ``None`` instead of a snapshot.
    * Any other HTTP error is re-raised to the caller.
    """
    save_url = f"https://web.archive.org/save/{url}"
    while True:
        try:
            response = self._session.get(save_url, allow_redirects=False)
            response.raise_for_status()
        except requests.exceptions.HTTPError as error:
            status_code = error.response.status_code
            if status_code == 520:
                # "Job failed". We will try again in the next workflow run.
                return None
            if status_code != 429:
                raise
            # Rate-limited: wait a bit, then go around the loop again.
            print(error)
            print("Sleeping...")
            time.sleep(10)
            continue

        # The redirect target is the URL of the freshly created capture.
        return self._fetch_wayback_snapshot(url, response.headers["Location"])
def _authenticated_save_page_now(
self, url: str, ias3_auth: str
) -> typing.Optional[models.WebPageSnapshot]:
for _ in range(3):
response = self._session.post(
"https://web.archive.org/save/",
allow_redirects=False,
data={"url": url},
headers={
"Accept": "application/json",
"Authorization": f"LOW {ias3_auth}",
},
)
response.raise_for_status()
job_id = response.json()["job_id"]
status = "pending"
while status == "pending":
time.sleep(5)
response = self._session.get(
f"https://web.archive.org/save/status/{job_id}"
)
response.raise_for_status()
status = response.json()["status"]
if status == "success":
break
if response.json()["status_ext"] == "error:service-unavailable":
print(response.json()["message"])
time.sleep(10)
continue # retry
elif response.json()["status_ext"] == "error:too-many-daily-captures":
# typically happens when a page repeatedly fails so we
# (unsuccessfully) tried to capture it too many times
return None
assert False, response.json()
else:
print("Too many failures; giving up.")
return None
ia_timestamp = response.json()["timestamp"]
snapshot_date = _datetime_from_ia_timestamp(ia_timestamp)
assert snapshot_date >= self.min_snapshot_date, (
snapshot_date.isoformat(),
self.min_snapshot_date.isoformat(),
) )
response.raise_for_status() # TODO: retry
wayback_url = response.headers["Location"] wayback_url = f"https://web.archive.org/web/{ia_timestamp}/{url}"
return self._fetch_wayback_snapshot(url, wayback_url) return self._fetch_wayback_snapshot(url, wayback_url)
def _get_cached_snapshot(self, url: str) -> typing.Optional[models.WebPageSnapshot]: def _get_cached_snapshot(self, url: str) -> typing.Optional[models.WebPageSnapshot]:
@ -137,7 +217,9 @@ class Session:
return snapshot return snapshot
def get_or_fetch_snapshot(self, url: str) -> models.WebPageSnapshot: def get_or_fetch_snapshot(
self, url: str
) -> typing.Optional[models.WebPageSnapshot]:
""" """
Fetches the given URL from the local cache or from the Wayback Machine. Fetches the given URL from the local cache or from the Wayback Machine.
@ -158,5 +240,6 @@ class Session:
# If the Internet Archive does not have it yet, trigger its Save Page Now, # If the Internet Archive does not have it yet, trigger its Save Page Now,
# and query the Wayback Machine again # and query the Wayback Machine again
snapshot = self._save_page_now(url) snapshot = self._save_page_now(url)
self._db.add_web_page_snapshots([snapshot]) if snapshot is not None:
self._db.add_web_page_snapshots([snapshot])
return snapshot return snapshot

View File

@ -38,6 +38,26 @@ def requests_mocker():
SNAPSHOT_DATE = datetime.datetime(2022, 9, 20, 1, 49, 34, tzinfo=datetime.timezone.utc) SNAPSHOT_DATE = datetime.datetime(2022, 9, 20, 1, 49, 34, tzinfo=datetime.timezone.utc)
@pytest.fixture
def mock_time(mocker):
    """
    Patches ``time.sleep`` so that it returns immediately while recording the
    requested duration.

    The fixture value is a zero-argument callable returning the sum of all
    durations passed to ``time.sleep`` so far (``0`` before any call).
    """
    requested_delays = []

    def fake_sleep(seconds):
        # Record the delay instead of actually waiting.
        requested_delays.append(seconds)

    mocker.patch("time.sleep", side_effect=fake_sleep)

    return lambda: sum(requested_delays)
@pytest.fixture @pytest.fixture
def configured_requests_mocker(requests_mocker): def configured_requests_mocker(requests_mocker):
"""Extension of :func:`requests_mocker` that registers a bunch of URLs of """Extension of :func:`requests_mocker` that registers a bunch of URLs of
@ -75,13 +95,20 @@ def configured_requests_mocker(requests_mocker):
complete_qs=True, complete_qs=True,
text="Example page content from Wayback Machine after Save Page Now", text="Example page content from Wayback Machine after Save Page Now",
) )
requests_mocker.register_uri(
"GET",
"https://web.archive.org/web/20220922000000id_/http://example.org/",
complete_qs=True,
text="Example page content from Wayback Machine after Save Page Now",
)
# Anonymous SPN:
requests_mocker.register_uri( requests_mocker.register_uri(
"GET", "GET",
"https://web.archive.org/save/http://example.org/", "https://web.archive.org/save/http://example.org/",
complete_qs=True, complete_qs=True,
headers={ headers={
"location": "https://web.archive.org/web/20220920164222/http://example.org/" "location": "https://web.archive.org/web/20220922000000/http://example.org/"
}, },
text=""" text="""
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN"> <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
@ -90,6 +117,37 @@ def configured_requests_mocker(requests_mocker):
""", """,
) )
# Authenticated SPN:
requests_mocker.register_uri(
"POST",
"https://web.archive.org/save/",
complete_qs=True,
text='{"url": "http://example.org/", "job_id": "spn2-abcde"}',
)
requests_mocker.register_uri(
"GET",
"https://web.archive.org/save/status/spn2-abcde",
[
dict(
text="""
{"job_id":"spn2-abcde","resources":["blah"],"status":"pending"}
"""
),
dict(
text="""
{"job_id":"spn2-abcde","resources":["blah"],"status":"pending"}
"""
),
dict(
text="""
{"job_id":"spn2-abcde","resources":["blah"],"status":"success",
"timestamp":"20220922000000"}
"""
),
],
complete_qs=True,
)
yield requests_mocker yield requests_mocker
@ -131,6 +189,7 @@ def test_get__uncached__recent_wb(configured_requests_mocker, opdb_db: Db):
snapshot = s.get_or_fetch_snapshot("http://example.org/") snapshot = s.get_or_fetch_snapshot("http://example.org/")
dt_after = datetime.datetime.now(tz=datetime.timezone.utc) dt_after = datetime.datetime.now(tz=datetime.timezone.utc)
assert snapshot is not None
assert dt_before <= snapshot.retrieved_at <= dt_after assert dt_before <= snapshot.retrieved_at <= dt_after
assert snapshot == models.WebPageSnapshot( assert snapshot == models.WebPageSnapshot(
@ -187,27 +246,51 @@ def test_get__expired_cache__recent_wb(configured_requests_mocker, opdb_db: Db):
test_get__uncached__recent_wb(configured_requests_mocker, opdb_db) test_get__uncached__recent_wb(configured_requests_mocker, opdb_db)
def test_get__uncached__expired_wb(configured_requests_mocker, opdb_db: Db): @pytest.fixture(
params=[
pytest.param(False, id="anonymous"),
pytest.param(True, id="authenticated"),
],
)
def authenticated(request) -> bool:
"""Parametrized by False/True"""
return request.param
def test_get__uncached__expired_wb(
configured_requests_mocker, opdb_db: Db, authenticated: bool, mock_time
):
""" """
Tests getting a snapshot that is not in the local cache, and expired in Tests getting a snapshot that is not in the local cache, and expired in
the Wayback Machine -> uses Save Page Now the Wayback Machine -> uses Save Page Now
""" """
after_date = SNAPSHOT_DATE + datetime.timedelta(days=1) after_date = SNAPSHOT_DATE + datetime.timedelta(days=1)
s = Session(opdb_db, after_date) if authenticated:
s = Session(opdb_db, after_date, ias3_auth="akey:skey")
else:
s = Session(opdb_db, after_date)
assert mock_time() == 0
dt_before = datetime.datetime.now(tz=datetime.timezone.utc) dt_before = datetime.datetime.now(tz=datetime.timezone.utc)
snapshot = s.get_or_fetch_snapshot("http://example.org/") snapshot = s.get_or_fetch_snapshot("http://example.org/")
dt_after = datetime.datetime.now(tz=datetime.timezone.utc) dt_after = datetime.datetime.now(tz=datetime.timezone.utc)
if authenticated:
assert mock_time() == 15 # three time.sleep(5) calls
else:
assert mock_time() == 0 # no sleep() because blocking on SPN GET request
assert snapshot is not None
assert dt_before <= snapshot.retrieved_at <= dt_after assert dt_before <= snapshot.retrieved_at <= dt_after
assert snapshot == models.WebPageSnapshot( assert snapshot == models.WebPageSnapshot(
url="http://example.org/", url="http://example.org/",
snapshot_date=datetime.datetime( snapshot_date=datetime.datetime(
2022, 9, 20, 16, 42, 22, tzinfo=datetime.timezone.utc 2022, 9, 22, 0, 0, 0, tzinfo=datetime.timezone.utc
), ),
snapshot_url=( snapshot_url=(
"https://web.archive.org/web/20220920164222id_/" # SPN returns HTTPS URLs "https://web.archive.org/web/20220922000000id_/" # SPN returns HTTPS URLs
"http://example.org/" "http://example.org/"
), ),
retrieved_at=snapshot.retrieved_at, retrieved_at=snapshot.retrieved_at,
@ -218,24 +301,54 @@ def test_get__uncached__expired_wb(configured_requests_mocker, opdb_db: Db):
assert snapshot == opdb_db.get_last_web_page_snapshot("http://example.org/") assert snapshot == opdb_db.get_last_web_page_snapshot("http://example.org/")
assert [(r.method, r.url) for r in configured_requests_mocker.request_history] == [ if authenticated:
( assert [
"GET", (r.method, r.url, r.body)
"https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F", for r in configured_requests_mocker.request_history
), ] == [
( (
"GET", "GET",
"https://web.archive.org/save/http://example.org/", "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
), None,
( ),
"GET", (
"https://web.archive.org/web/20220920164222id_/" # ditto "POST",
"http://example.org/", "https://web.archive.org/save/",
), "url=http%3A%2F%2Fexample.org%2F",
] ),
("GET", "https://web.archive.org/save/status/spn2-abcde", None),
("GET", "https://web.archive.org/save/status/spn2-abcde", None),
("GET", "https://web.archive.org/save/status/spn2-abcde", None),
(
"GET",
"https://web.archive.org/web/20220922000000id_/" # ditto
"http://example.org/",
None,
),
]
else:
assert [
(r.method, r.url) for r in configured_requests_mocker.request_history
] == [
(
"GET",
"https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
),
(
"GET",
"https://web.archive.org/save/http://example.org/",
),
(
"GET",
"https://web.archive.org/web/20220922000000id_/" # ditto
"http://example.org/",
),
]
def test_get__expired_cache__expired_wb(configured_requests_mocker, opdb_db: Db): def test_get__expired_cache__expired_wb(
configured_requests_mocker, opdb_db: Db, authenticated: bool, mock_time
):
""" """
Tests getting a snapshot that is expired in the local cache, and expired in Tests getting a snapshot that is expired in the local cache, and expired in
the Wayback Machine -> uses Save Page Now the Wayback Machine -> uses Save Page Now
@ -256,10 +369,14 @@ def test_get__expired_cache__expired_wb(configured_requests_mocker, opdb_db: Db)
opdb_db.add_web_page_snapshots([snapshot]) opdb_db.add_web_page_snapshots([snapshot])
# Reuse the other test; web_cache.Session should simply ignore the outdated snapshot # Reuse the other test; web_cache.Session should simply ignore the outdated snapshot
test_get__uncached__expired_wb(configured_requests_mocker, opdb_db) test_get__uncached__expired_wb(
configured_requests_mocker, opdb_db, authenticated, mock_time
)
def test_get__expired_cache__no_wb(configured_requests_mocker, opdb_db: Db): def test_get__expired_cache__no_wb(
configured_requests_mocker, opdb_db: Db, authenticated: bool, mock_time
):
""" """
Tests getting a snapshot that is expired in the local cache, and absent from Tests getting a snapshot that is expired in the local cache, and absent from
the Wayback Machine -> uses Save Page Now the Wayback Machine -> uses Save Page Now
@ -273,4 +390,6 @@ def test_get__expired_cache__no_wb(configured_requests_mocker, opdb_db: Db):
# Reuse the other test; web_cache.Session should treat the absence of a page # Reuse the other test; web_cache.Session should treat the absence of a page
# exactly the same way as an expired one. # exactly the same way as an expired one.
test_get__uncached__expired_wb(configured_requests_mocker, opdb_db) test_get__uncached__expired_wb(
configured_requests_mocker, opdb_db, authenticated, mock_time
)

View File

@ -15,6 +15,7 @@ dependencies = [
[project.optional-dependencies] [project.optional-dependencies]
testing = [ testing = [
"pytest", "pytest",
"pytest-mock",
"pytest-postgresql", "pytest-postgresql",
"requests-mock", "requests-mock",
"types-requests", "types-requests",
@ -41,8 +42,12 @@ disable = [
"fixme", "fixme",
"invalid-name", "invalid-name",
"no-else-return", "no-else-return",
"no-else-continue",
"too-few-public-methods", "too-few-public-methods",
"too-many-instance-attributes", "too-many-instance-attributes",
# false positives:
"unreachable",
"assignment-from-no-return",
# mypy does it better: # mypy does it better:
"no-member", "no-member",
"import-error", "import-error",