From 55c48b077b213813d744ab13e9dafe921aa68282 Mon Sep 17 00:00:00 2001 From: jackwener Date: Sat, 7 Mar 2026 19:02:49 +0800 Subject: [PATCH] feat: add rate limiting, retry with backoff, and max count cap - Add configurable request delay between paginated API calls (default 1.5s) - Add retry with exponential backoff on HTTP 429 and Twitter error code 88 - Add hard max count cap (default 200, absolute ceiling 500) - Add rateLimit config section with requestDelay, maxRetries, retryBaseDelay, maxCount - Add normalization tests for rateLimit config --- config.yaml | 6 +++ tests/test_cli.py | 2 +- tests/test_config_normalization.py | 27 ++++++++++ twitter_cli/cli.py | 17 +++--- twitter_cli/client.py | 87 ++++++++++++++++++++++-------- twitter_cli/config.py | 17 ++++++ 6 files changed, 125 insertions(+), 31 deletions(-) diff --git a/config.yaml b/config.yaml index 2649084..0a62952 100644 --- a/config.yaml +++ b/config.yaml @@ -13,3 +13,9 @@ filter: replies: 2.0 bookmarks: 5.0 views_log: 0.5 + +rateLimit: + requestDelay: 1.5 # seconds between paginated requests + maxRetries: 3 # retry count on 429 / rate-limit errors + retryBaseDelay: 5.0 # base delay for exponential backoff (seconds) + maxCount: 200 # hard cap for single fetch diff --git a/tests/test_cli.py b/tests/test_cli.py index d818867..baf87d9 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -12,7 +12,7 @@ def test_cli_user_command_works_with_client_factory(monkeypatch) -> None: def fetch_user(self, screen_name: str) -> UserProfile: return UserProfile(id="1", name="Alice", screen_name=screen_name) - monkeypatch.setattr("twitter_cli.cli._get_client", lambda: FakeClient()) + monkeypatch.setattr("twitter_cli.cli._get_client", lambda config=None: FakeClient()) runner = CliRunner() result = runner.invoke(cli, ["user", "alice"]) assert result.exit_code == 0 diff --git a/tests/test_config_normalization.py b/tests/test_config_normalization.py index a9a2d08..450e0bc 100644 --- a/tests/test_config_normalization.py +++ b/tests/test_config_normalization.py @@ -33,3 +33,30 @@ def test_filter_normalization_for_invalid_values(tmp_path: Path) -> None: assert config["filter"]["lang"] == [] assert config["filter"]["weights"]["likes"] == 1.0 assert config["filter"]["weights"]["retweets"] == 4.0 + # rateLimit should get defaults since it wasn't in the yaml + assert config["rateLimit"]["requestDelay"] == 1.5 + assert config["rateLimit"]["maxRetries"] == 3 + assert config["rateLimit"]["retryBaseDelay"] == 5.0 + assert config["rateLimit"]["maxCount"] == 200 + + +def test_rate_limit_normalization(tmp_path: Path) -> None: + config_file = tmp_path / "config.yaml" + config_file.write_text( + "\n".join( + [ + "rateLimit:", + " requestDelay: -2", + " maxRetries: bad", + " retryBaseDelay: 0.1", + " maxCount: 0", + ] + ), + encoding="utf-8", + ) + + config = load_config(str(config_file)) + assert config["rateLimit"]["requestDelay"] == 0.0 # clamped to >= 0 + assert config["rateLimit"]["maxRetries"] == 3 # fallback to default + assert config["rateLimit"]["retryBaseDelay"] == 1.0 # clamped to >= 1.0 + assert config["rateLimit"]["maxCount"] == 1 # clamped to >= 1 diff --git a/twitter_cli/cli.py b/twitter_cli/cli.py index 6c83f96..1bb7149 100644 --- a/twitter_cli/cli.py +++ b/twitter_cli/cli.py @@ -60,15 +60,16 @@ def _load_tweets_from_json(path): raise RuntimeError("Invalid tweet JSON file %s: %s" % (path, exc)) -def _get_client(): - # type: () -> TwitterClient +def _get_client(config=None): + # type: (Optional[Dict[str, Any]]) -> TwitterClient """Create an authenticated API client.""" console.print("\nšŸ” Getting Twitter cookies...") try: cookies = get_cookies() except RuntimeError as exc: raise RuntimeError(str(exc)) - return TwitterClient(cookies["auth_token"], cookies["ct0"]) + rate_limit_config = (config or {}).get("rateLimit") + return TwitterClient(cookies["auth_token"], cookies["ct0"], rate_limit_config) def _resolve_fetch_count(max_count, configured): @@ -128,7 +129,7 @@ def feed(feed_type, max_count, as_json, input_file, output_file, do_filter): console.print(" Loaded %d tweets" % len(tweets)) else: fetch_count = _resolve_fetch_count(max_count, config.get("fetch", {}).get("count", 50)) - client = _get_client() + client = _get_client(config) label = "following feed" if feed_type == "following" else "home timeline" console.print("šŸ“” Fetching %s (%d tweets)...\n" % (label, fetch_count)) start = time.time() @@ -169,7 +170,7 @@ def favorite(max_count, as_json, output_file, do_filter): config = load_config() try: fetch_count = _resolve_fetch_count(max_count, config.get("fetch", {}).get("count", 50)) - client = _get_client() + client = _get_client(config) console.print("šŸ”– Fetching favorites (%d tweets)...\n" % fetch_count) start = time.time() tweets = client.fetch_bookmarks(fetch_count) @@ -199,8 +200,9 @@ def user(screen_name): # type: (str,) -> None """View a user's profile. SCREEN_NAME is the @handle (without @).""" screen_name = screen_name.lstrip("@") + config = load_config() try: - client = _get_client() + client = _get_client(config) console.print("šŸ‘¤ Fetching user @%s..." % screen_name) profile = client.fetch_user(screen_name) except RuntimeError as exc: @@ -219,9 +221,10 @@ def user_posts(screen_name, max_count, as_json): # type: (str, int, bool) -> None """List a user's tweets. SCREEN_NAME is the @handle (without @).""" screen_name = screen_name.lstrip("@") + config = load_config() try: fetch_count = _resolve_fetch_count(max_count, 20) - client = _get_client() + client = _get_client(config) console.print("šŸ‘¤ Fetching @%s's profile..." % screen_name) profile = client.fetch_user(screen_name) console.print("šŸ“ Fetching tweets (%d)...\n" % fetch_count) diff --git a/twitter_cli/client.py b/twitter_cli/client.py index fd208b3..cbc6231 100644 --- a/twitter_cli/client.py +++ b/twitter_cli/client.py @@ -6,6 +6,7 @@ import json import logging import math import re +import time import ssl import urllib.error import urllib.parse @@ -201,13 +202,22 @@ def _resolve_query_id(operation_name, prefer_fallback=True): raise RuntimeError('Cannot resolve queryId for "%s"' % operation_name) +# Hard ceiling to prevent accidental massive fetches +_ABSOLUTE_MAX_COUNT = 500 + + class TwitterClient: """Twitter GraphQL API client using cookie authentication.""" - def __init__(self, auth_token, ct0): - # type: (str, str) -> None + def __init__(self, auth_token, ct0, rate_limit_config=None): + # type: (str, str, Optional[Dict[str, Any]]) -> None self._auth_token = auth_token self._ct0 = ct0 + rl = rate_limit_config or {} + self._request_delay = float(rl.get("requestDelay", 1.5)) + self._max_retries = int(rl.get("maxRetries", 3)) + self._retry_base_delay = float(rl.get("retryBaseDelay", 5.0)) + self._max_count = min(int(rl.get("maxCount", 200)), _ABSOLUTE_MAX_COUNT) def fetch_home_timeline(self, count=20): # type: (int) -> List[Tweet] @@ -308,6 +318,9 @@ class TwitterClient: if count <= 0: return [] + # Enforce max count cap + count = min(count, self._max_count) + tweets = [] # type: List[Tweet] seen_ids = set() # type: Set[str] cursor = None # type: Optional[str] @@ -339,6 +352,11 @@ class TwitterClient: break cursor = next_cursor + # Rate-limit: sleep between paginated requests + if len(tweets) < count and self._request_delay > 0: + logger.debug("Sleeping %.1fs between requests", self._request_delay) + time.sleep(self._request_delay) + return tweets[:count] def _graphql_get(self, operation_name, variables, features): @@ -379,31 +397,54 @@ class TwitterClient: def _api_get(self, url): # type: (str) -> Dict[str, Any] - """Make authenticated GET request to Twitter API.""" + """Make authenticated GET request to Twitter API with retry on 429.""" headers = self._build_headers() - request = urllib.request.Request(url) - for key, value in headers.items(): - request.add_header(key, value) - try: - with urllib.request.urlopen(request, context=_create_ssl_context(), timeout=30) as response: - payload = response.read().decode("utf-8") - except urllib.error.HTTPError as exc: - body = exc.read().decode("utf-8", errors="replace") - message = "Twitter API error %d: %s" % (exc.code, body[:500]) - raise TwitterAPIError(exc.code, message) - except urllib.error.URLError as exc: - raise TwitterAPIError(0, "Twitter API network error: %s" % exc.reason) + for attempt in range(self._max_retries + 1): + request = urllib.request.Request(url) + for key, value in headers.items(): + request.add_header(key, value) - try: - parsed = json.loads(payload) - except json.JSONDecodeError: - raise TwitterAPIError(0, "Twitter API returned invalid JSON") + try: + with urllib.request.urlopen(request, context=_create_ssl_context(), timeout=30) as response: + payload = response.read().decode("utf-8") + except urllib.error.HTTPError as exc: + if exc.code == 429 and attempt < self._max_retries: + wait = self._retry_base_delay * (2 ** attempt) + logger.warning( + "Rate limited (429), retrying in %.1fs (attempt %d/%d)", + wait, attempt + 1, self._max_retries, + ) + time.sleep(wait) + continue + body = exc.read().decode("utf-8", errors="replace") + message = "Twitter API error %d: %s" % (exc.code, body[:500]) + raise TwitterAPIError(exc.code, message) + except urllib.error.URLError as exc: + raise TwitterAPIError(0, "Twitter API network error: %s" % exc.reason) - if isinstance(parsed, dict) and parsed.get("errors"): - message = parsed["errors"][0].get("message", "Unknown error") - raise TwitterAPIError(0, "Twitter API returned errors: %s" % message) - return parsed + try: + parsed = json.loads(payload) + except json.JSONDecodeError: + raise TwitterAPIError(0, "Twitter API returned invalid JSON") + + if isinstance(parsed, dict) and parsed.get("errors"): + err_msg = parsed["errors"][0].get("message", "Unknown error") + # Rate limit can also surface as a JSON error (code 88) + err_code = parsed["errors"][0].get("code", 0) + if err_code == 88 and attempt < self._max_retries: + wait = self._retry_base_delay * (2 ** attempt) + logger.warning( + "Rate limited (code 88), retrying in %.1fs (attempt %d/%d)", + wait, attempt + 1, self._max_retries, + ) + time.sleep(wait) + continue + raise TwitterAPIError(0, "Twitter API returned errors: %s" % err_msg) + return parsed + + # Should not be reached, but just in case + raise TwitterAPIError(429, "Rate limited after %d retries" % self._max_retries) def _parse_timeline_response(self, data, get_instructions): # type: (Any, Callable[[Any], Any]) -> Tuple[List[Tweet], Optional[str]] diff --git a/twitter_cli/config.py b/twitter_cli/config.py index 819a95b..0f3f6f4 100644 --- a/twitter_cli/config.py +++ b/twitter_cli/config.py @@ -29,6 +29,12 @@ DEFAULT_CONFIG = { "views_log": 0.5, }, }, + "rateLimit": { + "requestDelay": 1.5, + "maxRetries": 3, + "retryBaseDelay": 5.0, + "maxCount": 200, + }, } # type: Dict[str, Any] @@ -128,6 +134,17 @@ def _normalize_config(config): filter_config["weights"] = normalized_weights merged["filter"] = filter_config + # Normalize rateLimit section + rl = merged.get("rateLimit") + if not isinstance(rl, dict): + rl = {} + default_rl = DEFAULT_CONFIG["rateLimit"] + rl["requestDelay"] = max(_as_float(rl.get("requestDelay"), default_rl["requestDelay"]), 0.0) + rl["maxRetries"] = max(_as_int(rl.get("maxRetries"), default_rl["maxRetries"]), 0) + rl["retryBaseDelay"] = max(_as_float(rl.get("retryBaseDelay"), default_rl["retryBaseDelay"]), 1.0) + rl["maxCount"] = max(_as_int(rl.get("maxCount"), default_rl["maxCount"]), 1) + merged["rateLimit"] = rl + return merged