Add optional support for the authenticated SPN API
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
It has much higher rate limits.
This commit is contained in:
parent
1db60954d6
commit
1eb169ee6c
|
@ -4,7 +4,7 @@ pipeline:
|
|||
image: python:3.9
|
||||
commands:
|
||||
- pip3 install flake8
|
||||
- make flake8
|
||||
- flake8 opdb/
|
||||
pylint:
|
||||
group: lint
|
||||
image: cytopia/pylint
|
||||
|
|
2
Makefile
2
Makefile
|
@ -10,7 +10,7 @@ black-check:
|
|||
black:
|
||||
black opdb/
|
||||
|
||||
flake8:
|
||||
flake8: black
|
||||
flake8 opdb/
|
||||
|
||||
pylint:
|
||||
|
|
|
@ -22,6 +22,7 @@ citation, and caches them in the local database for quick access by other worker
|
|||
import datetime
|
||||
import re
|
||||
import socket
|
||||
import time
|
||||
import typing
|
||||
|
||||
import pkg_resources
|
||||
|
@ -57,11 +58,14 @@ class Session:
|
|||
archived in the Internet Archive.
|
||||
"""
|
||||
|
||||
def __init__(self, db: Db, min_snapshot_date: datetime.datetime):
|
||||
def __init__(
|
||||
self, db: Db, min_snapshot_date: datetime.datetime, ias3_auth: str = None
|
||||
):
|
||||
self.min_snapshot_date = min_snapshot_date
|
||||
self._db = db
|
||||
self._session = requests.Session()
|
||||
self._session.headers["User-Agent"] = USER_AGENT
|
||||
self._ias3_auth = ias3_auth
|
||||
|
||||
def _fetch_newest_wayback_snapshot(
|
||||
self, url: str
|
||||
|
@ -122,12 +126,88 @@ class Session:
|
|||
content=response.content,
|
||||
)
|
||||
|
||||
def _save_page_now(self, url: str) -> models.WebPageSnapshot:
|
||||
response = self._session.get(
|
||||
f"https://web.archive.org/save/{url}", allow_redirects=False
|
||||
def _save_page_now(self, url: str) -> typing.Optional[models.WebPageSnapshot]:
|
||||
if self._ias3_auth is None:
|
||||
return self._anonymous_save_page_now(url)
|
||||
else:
|
||||
return self._authenticated_save_page_now(url, self._ias3_auth)
|
||||
|
||||
def _anonymous_save_page_now(
|
||||
self, url: str
|
||||
) -> typing.Optional[models.WebPageSnapshot]:
|
||||
while True:
|
||||
try:
|
||||
response = self._session.get(
|
||||
f"https://web.archive.org/save/{url}",
|
||||
allow_redirects=False,
|
||||
)
|
||||
response.raise_for_status()
|
||||
except requests.exceptions.HTTPError as e:
|
||||
if e.response.status_code == 429:
|
||||
print(e)
|
||||
print("Sleeping...")
|
||||
time.sleep(10)
|
||||
continue
|
||||
elif e.response.status_code == 520:
|
||||
# "Job failed". We will try again in the next workflow run.
|
||||
return None
|
||||
else:
|
||||
raise
|
||||
else:
|
||||
wayback_url = response.headers["Location"]
|
||||
return self._fetch_wayback_snapshot(url, wayback_url)
|
||||
|
||||
def _authenticated_save_page_now(
|
||||
self, url: str, ias3_auth: str
|
||||
) -> typing.Optional[models.WebPageSnapshot]:
|
||||
for _ in range(3):
|
||||
response = self._session.post(
|
||||
"https://web.archive.org/save/",
|
||||
allow_redirects=False,
|
||||
data={"url": url},
|
||||
headers={
|
||||
"Accept": "application/json",
|
||||
"Authorization": f"LOW {ias3_auth}",
|
||||
},
|
||||
)
|
||||
response.raise_for_status()
|
||||
job_id = response.json()["job_id"]
|
||||
|
||||
status = "pending"
|
||||
while status == "pending":
|
||||
time.sleep(5)
|
||||
response = self._session.get(
|
||||
f"https://web.archive.org/save/status/{job_id}"
|
||||
)
|
||||
response.raise_for_status()
|
||||
status = response.json()["status"]
|
||||
|
||||
if status == "success":
|
||||
break
|
||||
|
||||
if response.json()["status_ext"] == "error:service-unavailable":
|
||||
print(response.json()["message"])
|
||||
time.sleep(10)
|
||||
continue # retry
|
||||
elif response.json()["status_ext"] == "error:too-many-daily-captures":
|
||||
# typically happens when a page repeatedly fails so we
|
||||
# (unsuccessfully) tried to capture it too many times
|
||||
return None
|
||||
|
||||
assert False, response.json()
|
||||
else:
|
||||
print("Too many failures; giving up.")
|
||||
return None
|
||||
|
||||
ia_timestamp = response.json()["timestamp"]
|
||||
snapshot_date = _datetime_from_ia_timestamp(ia_timestamp)
|
||||
|
||||
assert snapshot_date >= self.min_snapshot_date, (
|
||||
snapshot_date.isoformat(),
|
||||
self.min_snapshot_date.isoformat(),
|
||||
)
|
||||
response.raise_for_status() # TODO: retry
|
||||
wayback_url = response.headers["Location"]
|
||||
|
||||
wayback_url = f"https://web.archive.org/web/{ia_timestamp}/{url}"
|
||||
return self._fetch_wayback_snapshot(url, wayback_url)
|
||||
|
||||
def _get_cached_snapshot(self, url: str) -> typing.Optional[models.WebPageSnapshot]:
|
||||
|
@ -137,7 +217,9 @@ class Session:
|
|||
|
||||
return snapshot
|
||||
|
||||
def get_or_fetch_snapshot(self, url: str) -> models.WebPageSnapshot:
|
||||
def get_or_fetch_snapshot(
|
||||
self, url: str
|
||||
) -> typing.Optional[models.WebPageSnapshot]:
|
||||
"""
|
||||
Fetches the given URL from the local cache or from the Wayback Machine.
|
||||
|
||||
|
@ -158,5 +240,6 @@ class Session:
|
|||
# If the Internet Archive does not have it yet, trigger its Save Page Now,
|
||||
# and query the Wayback Machine again
|
||||
snapshot = self._save_page_now(url)
|
||||
self._db.add_web_page_snapshots([snapshot])
|
||||
if snapshot is not None:
|
||||
self._db.add_web_page_snapshots([snapshot])
|
||||
return snapshot
|
||||
|
|
|
@ -38,6 +38,26 @@ def requests_mocker():
|
|||
SNAPSHOT_DATE = datetime.datetime(2022, 9, 20, 1, 49, 34, tzinfo=datetime.timezone.utc)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_time(mocker):
|
||||
"""
|
||||
Makes time.sleep return immediately, and returns a callable that returns the
|
||||
total offset introduced by time.sleep calls.
|
||||
"""
|
||||
offset = 0
|
||||
|
||||
def sleep(seconds):
|
||||
nonlocal offset
|
||||
offset += seconds
|
||||
|
||||
mocker.patch("time.sleep", side_effect=sleep)
|
||||
|
||||
def get_offset():
|
||||
return offset
|
||||
|
||||
return get_offset
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def configured_requests_mocker(requests_mocker):
|
||||
"""Extension of :func:`requests_mocker` that registers a bunch of URLs of
|
||||
|
@ -75,13 +95,20 @@ def configured_requests_mocker(requests_mocker):
|
|||
complete_qs=True,
|
||||
text="Example page content from Wayback Machine after Save Page Now",
|
||||
)
|
||||
requests_mocker.register_uri(
|
||||
"GET",
|
||||
"https://web.archive.org/web/20220922000000id_/http://example.org/",
|
||||
complete_qs=True,
|
||||
text="Example page content from Wayback Machine after Save Page Now",
|
||||
)
|
||||
|
||||
# Anonymous SPN:
|
||||
requests_mocker.register_uri(
|
||||
"GET",
|
||||
"https://web.archive.org/save/http://example.org/",
|
||||
complete_qs=True,
|
||||
headers={
|
||||
"location": "https://web.archive.org/web/20220920164222/http://example.org/"
|
||||
"location": "https://web.archive.org/web/20220922000000/http://example.org/"
|
||||
},
|
||||
text="""
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
|
||||
|
@ -90,6 +117,37 @@ def configured_requests_mocker(requests_mocker):
|
|||
""",
|
||||
)
|
||||
|
||||
# Authenticated SPN:
|
||||
requests_mocker.register_uri(
|
||||
"POST",
|
||||
"https://web.archive.org/save/",
|
||||
complete_qs=True,
|
||||
text='{"url": "http://example.org/", "job_id": "spn2-abcde"}',
|
||||
)
|
||||
requests_mocker.register_uri(
|
||||
"GET",
|
||||
"https://web.archive.org/save/status/spn2-abcde",
|
||||
[
|
||||
dict(
|
||||
text="""
|
||||
{"job_id":"spn2-abcde","resources":["blah"],"status":"pending"}
|
||||
"""
|
||||
),
|
||||
dict(
|
||||
text="""
|
||||
{"job_id":"spn2-abcde","resources":["blah"],"status":"pending"}
|
||||
"""
|
||||
),
|
||||
dict(
|
||||
text="""
|
||||
{"job_id":"spn2-abcde","resources":["blah"],"status":"success",
|
||||
"timestamp":"20220922000000"}
|
||||
"""
|
||||
),
|
||||
],
|
||||
complete_qs=True,
|
||||
)
|
||||
|
||||
yield requests_mocker
|
||||
|
||||
|
||||
|
@ -131,6 +189,7 @@ def test_get__uncached__recent_wb(configured_requests_mocker, opdb_db: Db):
|
|||
snapshot = s.get_or_fetch_snapshot("http://example.org/")
|
||||
dt_after = datetime.datetime.now(tz=datetime.timezone.utc)
|
||||
|
||||
assert snapshot is not None
|
||||
assert dt_before <= snapshot.retrieved_at <= dt_after
|
||||
|
||||
assert snapshot == models.WebPageSnapshot(
|
||||
|
@ -187,27 +246,51 @@ def test_get__expired_cache__recent_wb(configured_requests_mocker, opdb_db: Db):
|
|||
test_get__uncached__recent_wb(configured_requests_mocker, opdb_db)
|
||||
|
||||
|
||||
def test_get__uncached__expired_wb(configured_requests_mocker, opdb_db: Db):
|
||||
@pytest.fixture(
|
||||
params=[
|
||||
pytest.param(False, id="anonymous"),
|
||||
pytest.param(True, id="authenticated"),
|
||||
],
|
||||
)
|
||||
def authenticated(request) -> bool:
|
||||
"""Parametrized by False/True"""
|
||||
return request.param
|
||||
|
||||
|
||||
def test_get__uncached__expired_wb(
|
||||
configured_requests_mocker, opdb_db: Db, authenticated: bool, mock_time
|
||||
):
|
||||
"""
|
||||
Tests getting a snapshot that is not in the local cache, and expired in
|
||||
the Wayback Machine -> uses Save Page Now
|
||||
"""
|
||||
after_date = SNAPSHOT_DATE + datetime.timedelta(days=1)
|
||||
s = Session(opdb_db, after_date)
|
||||
if authenticated:
|
||||
s = Session(opdb_db, after_date, ias3_auth="akey:skey")
|
||||
else:
|
||||
s = Session(opdb_db, after_date)
|
||||
|
||||
assert mock_time() == 0
|
||||
|
||||
dt_before = datetime.datetime.now(tz=datetime.timezone.utc)
|
||||
snapshot = s.get_or_fetch_snapshot("http://example.org/")
|
||||
dt_after = datetime.datetime.now(tz=datetime.timezone.utc)
|
||||
|
||||
if authenticated:
|
||||
assert mock_time() == 15 # three time.sleep(5) calls
|
||||
else:
|
||||
assert mock_time() == 0 # no sleep() because blocking on SPN GET request
|
||||
|
||||
assert snapshot is not None
|
||||
assert dt_before <= snapshot.retrieved_at <= dt_after
|
||||
|
||||
assert snapshot == models.WebPageSnapshot(
|
||||
url="http://example.org/",
|
||||
snapshot_date=datetime.datetime(
|
||||
2022, 9, 20, 16, 42, 22, tzinfo=datetime.timezone.utc
|
||||
2022, 9, 22, 0, 0, 0, tzinfo=datetime.timezone.utc
|
||||
),
|
||||
snapshot_url=(
|
||||
"https://web.archive.org/web/20220920164222id_/" # SPN returns HTTPS URLs
|
||||
"https://web.archive.org/web/20220922000000id_/" # SPN returns HTTPS URLs
|
||||
"http://example.org/"
|
||||
),
|
||||
retrieved_at=snapshot.retrieved_at,
|
||||
|
@ -218,24 +301,54 @@ def test_get__uncached__expired_wb(configured_requests_mocker, opdb_db: Db):
|
|||
|
||||
assert snapshot == opdb_db.get_last_web_page_snapshot("http://example.org/")
|
||||
|
||||
assert [(r.method, r.url) for r in configured_requests_mocker.request_history] == [
|
||||
(
|
||||
"GET",
|
||||
"https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
|
||||
),
|
||||
(
|
||||
"GET",
|
||||
"https://web.archive.org/save/http://example.org/",
|
||||
),
|
||||
(
|
||||
"GET",
|
||||
"https://web.archive.org/web/20220920164222id_/" # ditto
|
||||
"http://example.org/",
|
||||
),
|
||||
]
|
||||
if authenticated:
|
||||
assert [
|
||||
(r.method, r.url, r.body)
|
||||
for r in configured_requests_mocker.request_history
|
||||
] == [
|
||||
(
|
||||
"GET",
|
||||
"https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
|
||||
None,
|
||||
),
|
||||
(
|
||||
"POST",
|
||||
"https://web.archive.org/save/",
|
||||
"url=http%3A%2F%2Fexample.org%2F",
|
||||
),
|
||||
("GET", "https://web.archive.org/save/status/spn2-abcde", None),
|
||||
("GET", "https://web.archive.org/save/status/spn2-abcde", None),
|
||||
("GET", "https://web.archive.org/save/status/spn2-abcde", None),
|
||||
(
|
||||
"GET",
|
||||
"https://web.archive.org/web/20220922000000id_/" # ditto
|
||||
"http://example.org/",
|
||||
None,
|
||||
),
|
||||
]
|
||||
else:
|
||||
assert [
|
||||
(r.method, r.url) for r in configured_requests_mocker.request_history
|
||||
] == [
|
||||
(
|
||||
"GET",
|
||||
"https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
|
||||
),
|
||||
(
|
||||
"GET",
|
||||
"https://web.archive.org/save/http://example.org/",
|
||||
),
|
||||
(
|
||||
"GET",
|
||||
"https://web.archive.org/web/20220922000000id_/" # ditto
|
||||
"http://example.org/",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
def test_get__expired_cache__expired_wb(configured_requests_mocker, opdb_db: Db):
|
||||
def test_get__expired_cache__expired_wb(
|
||||
configured_requests_mocker, opdb_db: Db, authenticated: bool, mock_time
|
||||
):
|
||||
"""
|
||||
Tests getting a snapshot that is expired in the local cache, and expired in
|
||||
the Wayback Machine -> uses Save Page Now
|
||||
|
@ -256,10 +369,14 @@ def test_get__expired_cache__expired_wb(configured_requests_mocker, opdb_db: Db)
|
|||
opdb_db.add_web_page_snapshots([snapshot])
|
||||
|
||||
# Reuse the other test; web_cache.Session should simply ignore the outdated snapshot
|
||||
test_get__uncached__expired_wb(configured_requests_mocker, opdb_db)
|
||||
test_get__uncached__expired_wb(
|
||||
configured_requests_mocker, opdb_db, authenticated, mock_time
|
||||
)
|
||||
|
||||
|
||||
def test_get__expired_cache__no_wb(configured_requests_mocker, opdb_db: Db):
|
||||
def test_get__expired_cache__no_wb(
|
||||
configured_requests_mocker, opdb_db: Db, authenticated: bool, mock_time
|
||||
):
|
||||
"""
|
||||
Tests getting a snapshot that is expired in the local cache, and absent from
|
||||
the Wayback Machine -> uses Save Page Now
|
||||
|
@ -273,4 +390,6 @@ def test_get__expired_cache__no_wb(configured_requests_mocker, opdb_db: Db):
|
|||
|
||||
# Reuse the other test; web_cache.Session should treat the absence of a page
|
||||
# exactly the same way as an expired one.
|
||||
test_get__uncached__expired_wb(configured_requests_mocker, opdb_db)
|
||||
test_get__uncached__expired_wb(
|
||||
configured_requests_mocker, opdb_db, authenticated, mock_time
|
||||
)
|
||||
|
|
|
@ -15,6 +15,7 @@ dependencies = [
|
|||
[project.optional-dependencies]
|
||||
testing = [
|
||||
"pytest",
|
||||
"pytest-mock",
|
||||
"pytest-postgresql",
|
||||
"requests-mock",
|
||||
"types-requests",
|
||||
|
@ -41,8 +42,12 @@ disable = [
|
|||
"fixme",
|
||||
"invalid-name",
|
||||
"no-else-return",
|
||||
"no-else-continue",
|
||||
"too-few-public-methods",
|
||||
"too-many-instance-attributes",
|
||||
# false positives:
|
||||
"unreachable",
|
||||
"assignment-from-no-return",
|
||||
# mypy does it better:
|
||||
"no-member",
|
||||
"import-error",
|
||||
|
|
Loading…
Reference in New Issue