Add optional support for the authenticated SPN API
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
The authenticated API has much higher rate limits than the anonymous one.
This commit is contained in:
parent
1db60954d6
commit
1eb169ee6c
|
@ -4,7 +4,7 @@ pipeline:
|
||||||
image: python:3.9
|
image: python:3.9
|
||||||
commands:
|
commands:
|
||||||
- pip3 install flake8
|
- pip3 install flake8
|
||||||
- make flake8
|
- flake8 opdb/
|
||||||
pylint:
|
pylint:
|
||||||
group: lint
|
group: lint
|
||||||
image: cytopia/pylint
|
image: cytopia/pylint
|
||||||
|
|
2
Makefile
2
Makefile
|
@ -10,7 +10,7 @@ black-check:
|
||||||
black:
|
black:
|
||||||
black opdb/
|
black opdb/
|
||||||
|
|
||||||
flake8:
|
flake8: black
|
||||||
flake8 opdb/
|
flake8 opdb/
|
||||||
|
|
||||||
pylint:
|
pylint:
|
||||||
|
|
|
@ -22,6 +22,7 @@ citation, and caches them in the local database for quick access by other worker
|
||||||
import datetime
|
import datetime
|
||||||
import re
|
import re
|
||||||
import socket
|
import socket
|
||||||
|
import time
|
||||||
import typing
|
import typing
|
||||||
|
|
||||||
import pkg_resources
|
import pkg_resources
|
||||||
|
@ -57,11 +58,14 @@ class Session:
|
||||||
archived in the Internet Archive.
|
archived in the Internet Archive.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, db: Db, min_snapshot_date: datetime.datetime):
|
def __init__(
|
||||||
|
self, db: Db, min_snapshot_date: datetime.datetime, ias3_auth: str = None
|
||||||
|
):
|
||||||
self.min_snapshot_date = min_snapshot_date
|
self.min_snapshot_date = min_snapshot_date
|
||||||
self._db = db
|
self._db = db
|
||||||
self._session = requests.Session()
|
self._session = requests.Session()
|
||||||
self._session.headers["User-Agent"] = USER_AGENT
|
self._session.headers["User-Agent"] = USER_AGENT
|
||||||
|
self._ias3_auth = ias3_auth
|
||||||
|
|
||||||
def _fetch_newest_wayback_snapshot(
|
def _fetch_newest_wayback_snapshot(
|
||||||
self, url: str
|
self, url: str
|
||||||
|
@ -122,12 +126,88 @@ class Session:
|
||||||
content=response.content,
|
content=response.content,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _save_page_now(self, url: str) -> models.WebPageSnapshot:
|
def _save_page_now(self, url: str) -> typing.Optional[models.WebPageSnapshot]:
|
||||||
response = self._session.get(
|
if self._ias3_auth is None:
|
||||||
f"https://web.archive.org/save/{url}", allow_redirects=False
|
return self._anonymous_save_page_now(url)
|
||||||
|
else:
|
||||||
|
return self._authenticated_save_page_now(url, self._ias3_auth)
|
||||||
|
|
||||||
|
def _anonymous_save_page_now(
|
||||||
|
self, url: str
|
||||||
|
) -> typing.Optional[models.WebPageSnapshot]:
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
response = self._session.get(
|
||||||
|
f"https://web.archive.org/save/{url}",
|
||||||
|
allow_redirects=False,
|
||||||
|
)
|
||||||
|
response.raise_for_status()
|
||||||
|
except requests.exceptions.HTTPError as e:
|
||||||
|
if e.response.status_code == 429:
|
||||||
|
print(e)
|
||||||
|
print("Sleeping...")
|
||||||
|
time.sleep(10)
|
||||||
|
continue
|
||||||
|
elif e.response.status_code == 520:
|
||||||
|
# "Job failed". We will try again in the next workflow run.
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
raise
|
||||||
|
else:
|
||||||
|
wayback_url = response.headers["Location"]
|
||||||
|
return self._fetch_wayback_snapshot(url, wayback_url)
|
||||||
|
|
||||||
|
def _authenticated_save_page_now(
|
||||||
|
self, url: str, ias3_auth: str
|
||||||
|
) -> typing.Optional[models.WebPageSnapshot]:
|
||||||
|
for _ in range(3):
|
||||||
|
response = self._session.post(
|
||||||
|
"https://web.archive.org/save/",
|
||||||
|
allow_redirects=False,
|
||||||
|
data={"url": url},
|
||||||
|
headers={
|
||||||
|
"Accept": "application/json",
|
||||||
|
"Authorization": f"LOW {ias3_auth}",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
response.raise_for_status()
|
||||||
|
job_id = response.json()["job_id"]
|
||||||
|
|
||||||
|
status = "pending"
|
||||||
|
while status == "pending":
|
||||||
|
time.sleep(5)
|
||||||
|
response = self._session.get(
|
||||||
|
f"https://web.archive.org/save/status/{job_id}"
|
||||||
|
)
|
||||||
|
response.raise_for_status()
|
||||||
|
status = response.json()["status"]
|
||||||
|
|
||||||
|
if status == "success":
|
||||||
|
break
|
||||||
|
|
||||||
|
if response.json()["status_ext"] == "error:service-unavailable":
|
||||||
|
print(response.json()["message"])
|
||||||
|
time.sleep(10)
|
||||||
|
continue # retry
|
||||||
|
elif response.json()["status_ext"] == "error:too-many-daily-captures":
|
||||||
|
# typically happens when a page repeatedly fails, so we have already
|
||||||
|
# (unsuccessfully) tried to capture it too many times today
|
||||||
|
return None
|
||||||
|
|
||||||
|
assert False, response.json()
|
||||||
|
else:
|
||||||
|
print("Too many failures; giving up.")
|
||||||
|
return None
|
||||||
|
|
||||||
|
ia_timestamp = response.json()["timestamp"]
|
||||||
|
snapshot_date = _datetime_from_ia_timestamp(ia_timestamp)
|
||||||
|
|
||||||
|
assert snapshot_date >= self.min_snapshot_date, (
|
||||||
|
snapshot_date.isoformat(),
|
||||||
|
self.min_snapshot_date.isoformat(),
|
||||||
)
|
)
|
||||||
response.raise_for_status() # TODO: retry
|
|
||||||
wayback_url = response.headers["Location"]
|
wayback_url = f"https://web.archive.org/web/{ia_timestamp}/{url}"
|
||||||
return self._fetch_wayback_snapshot(url, wayback_url)
|
return self._fetch_wayback_snapshot(url, wayback_url)
|
||||||
|
|
||||||
def _get_cached_snapshot(self, url: str) -> typing.Optional[models.WebPageSnapshot]:
|
def _get_cached_snapshot(self, url: str) -> typing.Optional[models.WebPageSnapshot]:
|
||||||
|
@ -137,7 +217,9 @@ class Session:
|
||||||
|
|
||||||
return snapshot
|
return snapshot
|
||||||
|
|
||||||
def get_or_fetch_snapshot(self, url: str) -> models.WebPageSnapshot:
|
def get_or_fetch_snapshot(
|
||||||
|
self, url: str
|
||||||
|
) -> typing.Optional[models.WebPageSnapshot]:
|
||||||
"""
|
"""
|
||||||
Fetches the given URL from the local cache or from the Wayback Machine.
|
Fetches the given URL from the local cache or from the Wayback Machine.
|
||||||
|
|
||||||
|
@ -158,5 +240,6 @@ class Session:
|
||||||
# If the Internet Archive does not have it yet, trigger its Save Page Now,
|
# If the Internet Archive does not have it yet, trigger its Save Page Now,
|
||||||
# and query the Wayback Machine again
|
# and query the Wayback Machine again
|
||||||
snapshot = self._save_page_now(url)
|
snapshot = self._save_page_now(url)
|
||||||
self._db.add_web_page_snapshots([snapshot])
|
if snapshot is not None:
|
||||||
|
self._db.add_web_page_snapshots([snapshot])
|
||||||
return snapshot
|
return snapshot
|
||||||
|
|
|
@ -38,6 +38,26 @@ def requests_mocker():
|
||||||
SNAPSHOT_DATE = datetime.datetime(2022, 9, 20, 1, 49, 34, tzinfo=datetime.timezone.utc)
|
SNAPSHOT_DATE = datetime.datetime(2022, 9, 20, 1, 49, 34, tzinfo=datetime.timezone.utc)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def mock_time(mocker):
|
||||||
|
"""
|
||||||
|
Makes time.sleep return immediately, and returns a callable that returns the
|
||||||
|
total offset introduced by time.sleep calls.
|
||||||
|
"""
|
||||||
|
offset = 0
|
||||||
|
|
||||||
|
def sleep(seconds):
|
||||||
|
nonlocal offset
|
||||||
|
offset += seconds
|
||||||
|
|
||||||
|
mocker.patch("time.sleep", side_effect=sleep)
|
||||||
|
|
||||||
|
def get_offset():
|
||||||
|
return offset
|
||||||
|
|
||||||
|
return get_offset
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def configured_requests_mocker(requests_mocker):
|
def configured_requests_mocker(requests_mocker):
|
||||||
"""Extension of :func:`requests_mocker` that registers a bunch of URLs of
|
"""Extension of :func:`requests_mocker` that registers a bunch of URLs of
|
||||||
|
@ -75,13 +95,20 @@ def configured_requests_mocker(requests_mocker):
|
||||||
complete_qs=True,
|
complete_qs=True,
|
||||||
text="Example page content from Wayback Machine after Save Page Now",
|
text="Example page content from Wayback Machine after Save Page Now",
|
||||||
)
|
)
|
||||||
|
requests_mocker.register_uri(
|
||||||
|
"GET",
|
||||||
|
"https://web.archive.org/web/20220922000000id_/http://example.org/",
|
||||||
|
complete_qs=True,
|
||||||
|
text="Example page content from Wayback Machine after Save Page Now",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Anonymous SPN:
|
||||||
requests_mocker.register_uri(
|
requests_mocker.register_uri(
|
||||||
"GET",
|
"GET",
|
||||||
"https://web.archive.org/save/http://example.org/",
|
"https://web.archive.org/save/http://example.org/",
|
||||||
complete_qs=True,
|
complete_qs=True,
|
||||||
headers={
|
headers={
|
||||||
"location": "https://web.archive.org/web/20220920164222/http://example.org/"
|
"location": "https://web.archive.org/web/20220922000000/http://example.org/"
|
||||||
},
|
},
|
||||||
text="""
|
text="""
|
||||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
|
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
|
||||||
|
@ -90,6 +117,37 @@ def configured_requests_mocker(requests_mocker):
|
||||||
""",
|
""",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Authenticated SPN:
|
||||||
|
requests_mocker.register_uri(
|
||||||
|
"POST",
|
||||||
|
"https://web.archive.org/save/",
|
||||||
|
complete_qs=True,
|
||||||
|
text='{"url": "http://example.org/", "job_id": "spn2-abcde"}',
|
||||||
|
)
|
||||||
|
requests_mocker.register_uri(
|
||||||
|
"GET",
|
||||||
|
"https://web.archive.org/save/status/spn2-abcde",
|
||||||
|
[
|
||||||
|
dict(
|
||||||
|
text="""
|
||||||
|
{"job_id":"spn2-abcde","resources":["blah"],"status":"pending"}
|
||||||
|
"""
|
||||||
|
),
|
||||||
|
dict(
|
||||||
|
text="""
|
||||||
|
{"job_id":"spn2-abcde","resources":["blah"],"status":"pending"}
|
||||||
|
"""
|
||||||
|
),
|
||||||
|
dict(
|
||||||
|
text="""
|
||||||
|
{"job_id":"spn2-abcde","resources":["blah"],"status":"success",
|
||||||
|
"timestamp":"20220922000000"}
|
||||||
|
"""
|
||||||
|
),
|
||||||
|
],
|
||||||
|
complete_qs=True,
|
||||||
|
)
|
||||||
|
|
||||||
yield requests_mocker
|
yield requests_mocker
|
||||||
|
|
||||||
|
|
||||||
|
@ -131,6 +189,7 @@ def test_get__uncached__recent_wb(configured_requests_mocker, opdb_db: Db):
|
||||||
snapshot = s.get_or_fetch_snapshot("http://example.org/")
|
snapshot = s.get_or_fetch_snapshot("http://example.org/")
|
||||||
dt_after = datetime.datetime.now(tz=datetime.timezone.utc)
|
dt_after = datetime.datetime.now(tz=datetime.timezone.utc)
|
||||||
|
|
||||||
|
assert snapshot is not None
|
||||||
assert dt_before <= snapshot.retrieved_at <= dt_after
|
assert dt_before <= snapshot.retrieved_at <= dt_after
|
||||||
|
|
||||||
assert snapshot == models.WebPageSnapshot(
|
assert snapshot == models.WebPageSnapshot(
|
||||||
|
@ -187,27 +246,51 @@ def test_get__expired_cache__recent_wb(configured_requests_mocker, opdb_db: Db):
|
||||||
test_get__uncached__recent_wb(configured_requests_mocker, opdb_db)
|
test_get__uncached__recent_wb(configured_requests_mocker, opdb_db)
|
||||||
|
|
||||||
|
|
||||||
def test_get__uncached__expired_wb(configured_requests_mocker, opdb_db: Db):
|
@pytest.fixture(
|
||||||
|
params=[
|
||||||
|
pytest.param(False, id="anonymous"),
|
||||||
|
pytest.param(True, id="authenticated"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def authenticated(request) -> bool:
|
||||||
|
"""Parametrized by False/True"""
|
||||||
|
return request.param
|
||||||
|
|
||||||
|
|
||||||
|
def test_get__uncached__expired_wb(
|
||||||
|
configured_requests_mocker, opdb_db: Db, authenticated: bool, mock_time
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
Tests getting a snapshot that is not in the local cache, and expired in
|
Tests getting a snapshot that is not in the local cache, and expired in
|
||||||
the Wayback Machine -> uses Save Page Now
|
the Wayback Machine -> uses Save Page Now
|
||||||
"""
|
"""
|
||||||
after_date = SNAPSHOT_DATE + datetime.timedelta(days=1)
|
after_date = SNAPSHOT_DATE + datetime.timedelta(days=1)
|
||||||
s = Session(opdb_db, after_date)
|
if authenticated:
|
||||||
|
s = Session(opdb_db, after_date, ias3_auth="akey:skey")
|
||||||
|
else:
|
||||||
|
s = Session(opdb_db, after_date)
|
||||||
|
|
||||||
|
assert mock_time() == 0
|
||||||
|
|
||||||
dt_before = datetime.datetime.now(tz=datetime.timezone.utc)
|
dt_before = datetime.datetime.now(tz=datetime.timezone.utc)
|
||||||
snapshot = s.get_or_fetch_snapshot("http://example.org/")
|
snapshot = s.get_or_fetch_snapshot("http://example.org/")
|
||||||
dt_after = datetime.datetime.now(tz=datetime.timezone.utc)
|
dt_after = datetime.datetime.now(tz=datetime.timezone.utc)
|
||||||
|
|
||||||
|
if authenticated:
|
||||||
|
assert mock_time() == 15 # three time.sleep(5) calls
|
||||||
|
else:
|
||||||
|
assert mock_time() == 0 # no sleep() because blocking on SPN GET request
|
||||||
|
|
||||||
|
assert snapshot is not None
|
||||||
assert dt_before <= snapshot.retrieved_at <= dt_after
|
assert dt_before <= snapshot.retrieved_at <= dt_after
|
||||||
|
|
||||||
assert snapshot == models.WebPageSnapshot(
|
assert snapshot == models.WebPageSnapshot(
|
||||||
url="http://example.org/",
|
url="http://example.org/",
|
||||||
snapshot_date=datetime.datetime(
|
snapshot_date=datetime.datetime(
|
||||||
2022, 9, 20, 16, 42, 22, tzinfo=datetime.timezone.utc
|
2022, 9, 22, 0, 0, 0, tzinfo=datetime.timezone.utc
|
||||||
),
|
),
|
||||||
snapshot_url=(
|
snapshot_url=(
|
||||||
"https://web.archive.org/web/20220920164222id_/" # SPN returns HTTPS URLs
|
"https://web.archive.org/web/20220922000000id_/" # SPN returns HTTPS URLs
|
||||||
"http://example.org/"
|
"http://example.org/"
|
||||||
),
|
),
|
||||||
retrieved_at=snapshot.retrieved_at,
|
retrieved_at=snapshot.retrieved_at,
|
||||||
|
@ -218,24 +301,54 @@ def test_get__uncached__expired_wb(configured_requests_mocker, opdb_db: Db):
|
||||||
|
|
||||||
assert snapshot == opdb_db.get_last_web_page_snapshot("http://example.org/")
|
assert snapshot == opdb_db.get_last_web_page_snapshot("http://example.org/")
|
||||||
|
|
||||||
assert [(r.method, r.url) for r in configured_requests_mocker.request_history] == [
|
if authenticated:
|
||||||
(
|
assert [
|
||||||
"GET",
|
(r.method, r.url, r.body)
|
||||||
"https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
|
for r in configured_requests_mocker.request_history
|
||||||
),
|
] == [
|
||||||
(
|
(
|
||||||
"GET",
|
"GET",
|
||||||
"https://web.archive.org/save/http://example.org/",
|
"https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
|
||||||
),
|
None,
|
||||||
(
|
),
|
||||||
"GET",
|
(
|
||||||
"https://web.archive.org/web/20220920164222id_/" # ditto
|
"POST",
|
||||||
"http://example.org/",
|
"https://web.archive.org/save/",
|
||||||
),
|
"url=http%3A%2F%2Fexample.org%2F",
|
||||||
]
|
),
|
||||||
|
("GET", "https://web.archive.org/save/status/spn2-abcde", None),
|
||||||
|
("GET", "https://web.archive.org/save/status/spn2-abcde", None),
|
||||||
|
("GET", "https://web.archive.org/save/status/spn2-abcde", None),
|
||||||
|
(
|
||||||
|
"GET",
|
||||||
|
"https://web.archive.org/web/20220922000000id_/" # ditto
|
||||||
|
"http://example.org/",
|
||||||
|
None,
|
||||||
|
),
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
assert [
|
||||||
|
(r.method, r.url) for r in configured_requests_mocker.request_history
|
||||||
|
] == [
|
||||||
|
(
|
||||||
|
"GET",
|
||||||
|
"https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F",
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"GET",
|
||||||
|
"https://web.archive.org/save/http://example.org/",
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"GET",
|
||||||
|
"https://web.archive.org/web/20220922000000id_/" # ditto
|
||||||
|
"http://example.org/",
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def test_get__expired_cache__expired_wb(configured_requests_mocker, opdb_db: Db):
|
def test_get__expired_cache__expired_wb(
|
||||||
|
configured_requests_mocker, opdb_db: Db, authenticated: bool, mock_time
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
Tests getting a snapshot that is expired in the local cache, and expired in
|
Tests getting a snapshot that is expired in the local cache, and expired in
|
||||||
the Wayback Machine -> uses Save Page Now
|
the Wayback Machine -> uses Save Page Now
|
||||||
|
@ -256,10 +369,14 @@ def test_get__expired_cache__expired_wb(configured_requests_mocker, opdb_db: Db)
|
||||||
opdb_db.add_web_page_snapshots([snapshot])
|
opdb_db.add_web_page_snapshots([snapshot])
|
||||||
|
|
||||||
# Reuse the other test; web_cache.Session should simply ignore the outdated snapshot
|
# Reuse the other test; web_cache.Session should simply ignore the outdated snapshot
|
||||||
test_get__uncached__expired_wb(configured_requests_mocker, opdb_db)
|
test_get__uncached__expired_wb(
|
||||||
|
configured_requests_mocker, opdb_db, authenticated, mock_time
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_get__expired_cache__no_wb(configured_requests_mocker, opdb_db: Db):
|
def test_get__expired_cache__no_wb(
|
||||||
|
configured_requests_mocker, opdb_db: Db, authenticated: bool, mock_time
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
Tests getting a snapshot that is expired in the local cache, and absent from
|
Tests getting a snapshot that is expired in the local cache, and absent from
|
||||||
the Wayback Machine -> uses Save Page Now
|
the Wayback Machine -> uses Save Page Now
|
||||||
|
@ -273,4 +390,6 @@ def test_get__expired_cache__no_wb(configured_requests_mocker, opdb_db: Db):
|
||||||
|
|
||||||
# Reuse the other test; web_cache.Session should treat the absence of a page
|
# Reuse the other test; web_cache.Session should treat the absence of a page
|
||||||
# exactly the same way as an expired one.
|
# exactly the same way as an expired one.
|
||||||
test_get__uncached__expired_wb(configured_requests_mocker, opdb_db)
|
test_get__uncached__expired_wb(
|
||||||
|
configured_requests_mocker, opdb_db, authenticated, mock_time
|
||||||
|
)
|
||||||
|
|
|
@ -15,6 +15,7 @@ dependencies = [
|
||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
testing = [
|
testing = [
|
||||||
"pytest",
|
"pytest",
|
||||||
|
"pytest-mock",
|
||||||
"pytest-postgresql",
|
"pytest-postgresql",
|
||||||
"requests-mock",
|
"requests-mock",
|
||||||
"types-requests",
|
"types-requests",
|
||||||
|
@ -41,8 +42,12 @@ disable = [
|
||||||
"fixme",
|
"fixme",
|
||||||
"invalid-name",
|
"invalid-name",
|
||||||
"no-else-return",
|
"no-else-return",
|
||||||
|
"no-else-continue",
|
||||||
"too-few-public-methods",
|
"too-few-public-methods",
|
||||||
"too-many-instance-attributes",
|
"too-many-instance-attributes",
|
||||||
|
# false positives:
|
||||||
|
"unreachable",
|
||||||
|
"assignment-from-no-return",
|
||||||
# mypy does it better:
|
# mypy does it better:
|
||||||
"no-member",
|
"no-member",
|
||||||
"import-error",
|
"import-error",
|
||||||
|
|
Loading…
Reference in New Issue