diff --git a/.woodpecker.yml b/.woodpecker.yml index 9b41b4b..0456108 100644 --- a/.woodpecker.yml +++ b/.woodpecker.yml @@ -4,7 +4,7 @@ pipeline: image: python:3.9 commands: - pip3 install flake8 - - make flake8 + - flake8 opdb/ pylint: group: lint image: cytopia/pylint diff --git a/Makefile b/Makefile index 92aec84..eac0f1c 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ black-check: black: black opdb/ -flake8: +flake8: black flake8 opdb/ pylint: diff --git a/opdb/web_cache.py b/opdb/web_cache.py index c8de61d..e432e8c 100644 --- a/opdb/web_cache.py +++ b/opdb/web_cache.py @@ -22,6 +22,7 @@ citation, and caches them in the local database for quick access by other worker import datetime import re import socket +import time import typing import pkg_resources @@ -57,11 +58,14 @@ class Session: archived in the Internet Archive. """ - def __init__(self, db: Db, min_snapshot_date: datetime.datetime): + def __init__( + self, db: Db, min_snapshot_date: datetime.datetime, ias3_auth: str = None + ): self.min_snapshot_date = min_snapshot_date self._db = db self._session = requests.Session() self._session.headers["User-Agent"] = USER_AGENT + self._ias3_auth = ias3_auth def _fetch_newest_wayback_snapshot( self, url: str @@ -122,12 +126,88 @@ class Session: content=response.content, ) - def _save_page_now(self, url: str) -> models.WebPageSnapshot: - response = self._session.get( - f"https://web.archive.org/save/{url}", allow_redirects=False + def _save_page_now(self, url: str) -> typing.Optional[models.WebPageSnapshot]: + if self._ias3_auth is None: + return self._anonymous_save_page_now(url) + else: + return self._authenticated_save_page_now(url, self._ias3_auth) + + def _anonymous_save_page_now( + self, url: str + ) -> typing.Optional[models.WebPageSnapshot]: + while True: + try: + response = self._session.get( + f"https://web.archive.org/save/{url}", + allow_redirects=False, + ) + response.raise_for_status() + except requests.exceptions.HTTPError as e: + if e.response.status_code == 429: + print(e) + print("Sleeping...") + time.sleep(10) + continue + elif e.response.status_code == 520: + # "Job failed". We will try again in the next workflow run. + return None + else: + raise + else: + wayback_url = response.headers["Location"] + return self._fetch_wayback_snapshot(url, wayback_url) + + def _authenticated_save_page_now( + self, url: str, ias3_auth: str + ) -> typing.Optional[models.WebPageSnapshot]: + for _ in range(3): + response = self._session.post( + "https://web.archive.org/save/", + allow_redirects=False, + data={"url": url}, + headers={ + "Accept": "application/json", + "Authorization": f"LOW {ias3_auth}", + }, + ) + response.raise_for_status() + job_id = response.json()["job_id"] + + status = "pending" + while status == "pending": + time.sleep(5) + response = self._session.get( + f"https://web.archive.org/save/status/{job_id}" + ) + response.raise_for_status() + status = response.json()["status"] + + if status == "success": + break + + if response.json()["status_ext"] == "error:service-unavailable": + print(response.json()["message"]) + time.sleep(10) + continue # retry + elif response.json()["status_ext"] == "error:too-many-daily-captures": + # typically happens when a page repeatedly fails so we + # (unsuccessfully) tried to capture it too many times + return None + + assert False, response.json() + else: + print("Too many failures; giving up.") + return None + + ia_timestamp = response.json()["timestamp"] + snapshot_date = _datetime_from_ia_timestamp(ia_timestamp) + + assert snapshot_date >= self.min_snapshot_date, ( + snapshot_date.isoformat(), + self.min_snapshot_date.isoformat(), ) - response.raise_for_status() # TODO: retry - wayback_url = response.headers["Location"] + + wayback_url = f"https://web.archive.org/web/{ia_timestamp}/{url}" return self._fetch_wayback_snapshot(url, wayback_url) def _get_cached_snapshot(self, url: str) -> typing.Optional[models.WebPageSnapshot]: @@ -137,7 +217,9 @@ class Session: return snapshot - def get_or_fetch_snapshot(self, url: str) -> models.WebPageSnapshot: + def get_or_fetch_snapshot( + self, url: str + ) -> typing.Optional[models.WebPageSnapshot]: """ Fetches the given URL from the local cache or from the Wayback Machine. @@ -158,5 +240,6 @@ class Session: # If the Internet Archive does not have it yet, trigger its Save Code Now, # and query the Wayback Machine again snapshot = self._save_page_now(url) - self._db.add_web_page_snapshots([snapshot]) + if snapshot is not None: + self._db.add_web_page_snapshots([snapshot]) return snapshot diff --git a/opdb/web_cache_test.py b/opdb/web_cache_test.py index dce341b..e8da069 100644 --- a/opdb/web_cache_test.py +++ b/opdb/web_cache_test.py @@ -38,6 +38,26 @@ def requests_mocker(): SNAPSHOT_DATE = datetime.datetime(2022, 9, 20, 1, 49, 34, tzinfo=datetime.timezone.utc) +@pytest.fixture +def mock_time(mocker): + """ + Makes time.sleep return immediately, and returns a callable that returns the + total offset introduced by time.sleep calls. + """ + offset = 0 + + def sleep(seconds): + nonlocal offset + offset += seconds + + mocker.patch("time.sleep", side_effect=sleep) + + def get_offset(): + return offset + + return get_offset + + @pytest.fixture def configured_requests_mocker(requests_mocker): """Extension of :func:`requests_mocker` that registers a bunch of URLs of @@ -75,13 +95,20 @@ def configured_requests_mocker(requests_mocker): complete_qs=True, text="Example page content from Wayback Machine after Save Page Now", ) + requests_mocker.register_uri( + "GET", + "https://web.archive.org/web/20220922000000id_/http://example.org/", + complete_qs=True, + text="Example page content from Wayback Machine after Save Page Now", + ) + # Anonymous SPN: requests_mocker.register_uri( "GET", "https://web.archive.org/save/http://example.org/", complete_qs=True, headers={ - "location": "https://web.archive.org/web/20220920164222/http://example.org/" + "location": "https://web.archive.org/web/20220922000000/http://example.org/" }, text=""" @@ -90,6 +117,37 @@ def configured_requests_mocker(requests_mocker): """, ) + # Authenticated SPN: + requests_mocker.register_uri( + "POST", + "https://web.archive.org/save/", + complete_qs=True, + text='{"url": "http://example.org/", "job_id": "spn2-abcde"}', + ) + requests_mocker.register_uri( + "GET", + "https://web.archive.org/save/status/spn2-abcde", + [ + dict( + text=""" + {"job_id":"spn2-abcde","resources":["blah"],"status":"pending"} + """ + ), + dict( + text=""" + {"job_id":"spn2-abcde","resources":["blah"],"status":"pending"} + """ + ), + dict( + text=""" + {"job_id":"spn2-abcde","resources":["blah"],"status":"success", + "timestamp":"20220922000000"} + """ + ), + ], + complete_qs=True, + ) + yield requests_mocker @@ -131,6 +189,7 @@ def test_get__uncached__recent_wb(configured_requests_mocker, opdb_db: Db): snapshot = s.get_or_fetch_snapshot("http://example.org/") dt_after = datetime.datetime.now(tz=datetime.timezone.utc) + assert snapshot is not None assert dt_before <= snapshot.retrieved_at <= dt_after assert snapshot == models.WebPageSnapshot( @@ -187,27 +246,51 @@ def test_get__expired_cache__recent_wb(configured_requests_mocker, opdb_db: Db): test_get__uncached__recent_wb(configured_requests_mocker, opdb_db) -def test_get__uncached__expired_wb(configured_requests_mocker, opdb_db: Db): +@pytest.fixture( + params=[ + pytest.param(False, id="anonymous"), + pytest.param(True, id="authenticated"), + ], +) +def authenticated(request) -> bool: + """Parametrized by False/True""" + return request.param + + +def test_get__uncached__expired_wb( + configured_requests_mocker, opdb_db: Db, authenticated: bool, mock_time +): """ Tests getting a snapshot that is not in the local cache, and expired in the Wayback Machine -> uses Save Page Now """ after_date = SNAPSHOT_DATE + datetime.timedelta(days=1) - s = Session(opdb_db, after_date) + if authenticated: + s = Session(opdb_db, after_date, ias3_auth="akey:skey") + else: + s = Session(opdb_db, after_date) + + assert mock_time() == 0 dt_before = datetime.datetime.now(tz=datetime.timezone.utc) snapshot = s.get_or_fetch_snapshot("http://example.org/") dt_after = datetime.datetime.now(tz=datetime.timezone.utc) + if authenticated: + assert mock_time() == 15 # three time.sleep(5) calls + else: + assert mock_time() == 0 # no sleep() because blocking on SPN GET request + + assert snapshot is not None assert dt_before <= snapshot.retrieved_at <= dt_after assert snapshot == models.WebPageSnapshot( url="http://example.org/", snapshot_date=datetime.datetime( - 2022, 9, 20, 16, 42, 22, tzinfo=datetime.timezone.utc + 2022, 9, 22, 0, 0, 0, tzinfo=datetime.timezone.utc ), snapshot_url=( - "https://web.archive.org/web/20220920164222id_/" # SPN returns HTTPS URLs + "https://web.archive.org/web/20220922000000id_/" # SPN returns HTTPS URLs "http://example.org/" ), retrieved_at=snapshot.retrieved_at, @@ -218,24 +301,54 @@ def test_get__uncached__expired_wb(configured_requests_mocker, opdb_db: Db): assert snapshot == opdb_db.get_last_web_page_snapshot("http://example.org/") - assert [(r.method, r.url) for r in configured_requests_mocker.request_history] == [ - ( - "GET", - "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F", - ), - ( - "GET", - "https://web.archive.org/save/http://example.org/", - ), - ( - "GET", - "https://web.archive.org/web/20220920164222id_/" # ditto - "http://example.org/", - ), - ] + if authenticated: + assert [ + (r.method, r.url, r.body) + for r in configured_requests_mocker.request_history + ] == [ + ( + "GET", + "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F", + None, + ), + ( + "POST", + "https://web.archive.org/save/", + "url=http%3A%2F%2Fexample.org%2F", + ), + ("GET", "https://web.archive.org/save/status/spn2-abcde", None), + ("GET", "https://web.archive.org/save/status/spn2-abcde", None), + ("GET", "https://web.archive.org/save/status/spn2-abcde", None), + ( + "GET", + "https://web.archive.org/web/20220922000000id_/" # ditto + "http://example.org/", + None, + ), + ] + else: + assert [ + (r.method, r.url) for r in configured_requests_mocker.request_history + ] == [ + ( + "GET", + "https://archive.org/wayback/available?url=http%3A%2F%2Fexample.org%2F", + ), + ( + "GET", + "https://web.archive.org/save/http://example.org/", + ), + ( + "GET", + "https://web.archive.org/web/20220922000000id_/" # ditto + "http://example.org/", + ), + ] -def test_get__expired_cache__expired_wb(configured_requests_mocker, opdb_db: Db): +def test_get__expired_cache__expired_wb( + configured_requests_mocker, opdb_db: Db, authenticated: bool, mock_time +): """ Tests getting a snapshot that is expired in the local cache, and expired in the Wayback Machine -> uses Save Page Now @@ -256,10 +369,14 @@ def test_get__expired_cache__expired_wb(configured_requests_mocker, opdb_db: Db) opdb_db.add_web_page_snapshots([snapshot]) # Reuse the other test; web_cache.Session should simply ignore the outdated snapshot - test_get__uncached__expired_wb(configured_requests_mocker, opdb_db) + test_get__uncached__expired_wb( + configured_requests_mocker, opdb_db, authenticated, mock_time + ) -def test_get__expired_cache__no_wb(configured_requests_mocker, opdb_db: Db): +def test_get__expired_cache__no_wb( + configured_requests_mocker, opdb_db: Db, authenticated: bool, mock_time +): """ Tests getting a snapshot that is expired in the local cache, and absent from the Wayback Machine -> uses Save Page Now @@ -273,4 +390,6 @@ def test_get__expired_cache__no_wb(configured_requests_mocker, opdb_db: Db): # Reuse the other test; web_cache.Session should treat the absence of a page # exactly the same way as an expired one. - test_get__uncached__expired_wb(configured_requests_mocker, opdb_db) + test_get__uncached__expired_wb( + configured_requests_mocker, opdb_db, authenticated, mock_time + ) diff --git a/pyproject.toml b/pyproject.toml index bd1be7d..8fe9e54 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ dependencies = [ [project.optional-dependencies] testing = [ "pytest", + "pytest-mock", "pytest-postgresql", "requests-mock", "types-requests", @@ -41,8 +42,12 @@ disable = [ "fixme", "invalid-name", "no-else-return", + "no-else-continue", "too-few-public-methods", "too-many-instance-attributes", + # false positives: + "unreachable", + "assignment-from-no-return", # mypy does it better: "no-member", "import-error",