From 55c48b077b213813d744ab13e9dafe921aa68282 Mon Sep 17 00:00:00 2001
From: jackwener <jakevingoo@gmail.com>
Date: Sat, 7 Mar 2026 19:02:49 +0800
Subject: [PATCH] feat: add rate limiting, retry with backoff, and max count
 cap

- Add configurable request delay between paginated API calls (default 1.5s)
- Add retry with exponential backoff on HTTP 429 and Twitter error code 88
- Add hard max count cap (default 200, absolute ceiling 500)
- Add rateLimit config section with requestDelay, maxRetries, retryBaseDelay, maxCount
- Add normalization tests for rateLimit config
---
 config.yaml                        |  6 +++
 tests/test_cli.py                  |  2 +-
 tests/test_config_normalization.py | 27 ++++++++++
 twitter_cli/cli.py                 | 17 +++---
 twitter_cli/client.py              | 87 ++++++++++++++++++++++--------
 twitter_cli/config.py              | 17 ++++++
 6 files changed, 125 insertions(+), 31 deletions(-)

diff --git a/config.yaml b/config.yaml
index 2649084..0a62952 100644
--- a/config.yaml
+++ b/config.yaml
@@ -13,3 +13,9 @@ filter:
     replies: 2.0
     bookmarks: 5.0
     views_log: 0.5
+
+rateLimit:
+  requestDelay: 1.5    # seconds between paginated requests
+  maxRetries: 3        # retry count on 429 / rate-limit errors
+  retryBaseDelay: 5.0  # base delay for exponential backoff (seconds)
+  maxCount: 200        # hard cap for a single fetch (the client clamps it to at most 500)
diff --git a/tests/test_cli.py b/tests/test_cli.py
index d818867..baf87d9 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -12,7 +12,7 @@ def test_cli_user_command_works_with_client_factory(monkeypatch) -> None:
         def fetch_user(self, screen_name: str) -> UserProfile:
             return UserProfile(id="1", name="Alice", screen_name=screen_name)
 
-    monkeypatch.setattr("twitter_cli.cli._get_client", lambda: FakeClient())
+    monkeypatch.setattr("twitter_cli.cli._get_client", lambda config=None: FakeClient())
     runner = CliRunner()
     result = runner.invoke(cli, ["user", "alice"])
     assert result.exit_code == 0
diff --git a/tests/test_config_normalization.py b/tests/test_config_normalization.py
index a9a2d08..450e0bc 100644
--- a/tests/test_config_normalization.py
+++ b/tests/test_config_normalization.py
@@ -33,3 +33,30 @@ def test_filter_normalization_for_invalid_values(tmp_path: Path) -> None:
     assert config["filter"]["lang"] == []
     assert config["filter"]["weights"]["likes"] == 1.0
     assert config["filter"]["weights"]["retweets"] == 4.0
+    # rateLimit should get defaults since it wasn't in the yaml
+    assert config["rateLimit"]["requestDelay"] == 1.5
+    assert config["rateLimit"]["maxRetries"] == 3
+    assert config["rateLimit"]["retryBaseDelay"] == 5.0
+    assert config["rateLimit"]["maxCount"] == 200
+
+
+def test_rate_limit_normalization(tmp_path: Path) -> None:
+    config_file = tmp_path / "config.yaml"
+    config_file.write_text(
+        "\n".join(
+            [
+                "rateLimit:",
+                "  requestDelay: -2",
+                "  maxRetries: bad",
+                "  retryBaseDelay: 0.1",
+                "  maxCount: 0",
+            ]
+        ),
+        encoding="utf-8",
+    )
+
+    config = load_config(str(config_file))
+    assert config["rateLimit"]["requestDelay"] == 0.0  # clamped to >= 0
+    assert config["rateLimit"]["maxRetries"] == 3  # fallback to default
+    assert config["rateLimit"]["retryBaseDelay"] == 1.0  # clamped to >= 1.0
+    assert config["rateLimit"]["maxCount"] == 1  # clamped to >= 1
diff --git a/twitter_cli/cli.py b/twitter_cli/cli.py
index 6c83f96..1bb7149 100644
--- a/twitter_cli/cli.py
+++ b/twitter_cli/cli.py
@@ -60,15 +60,16 @@ def _load_tweets_from_json(path):
         raise RuntimeError("Invalid tweet JSON file %s: %s" % (path, exc))
 
 
-def _get_client():
-    # type: () -> TwitterClient
+def _get_client(config=None):
+    # type: (Optional[Dict[str, Any]]) -> TwitterClient
     """Create an authenticated API client."""
     console.print("\n🔐 Getting Twitter cookies...")
     try:
         cookies = get_cookies()
     except RuntimeError as exc:
         raise RuntimeError(str(exc))
-    return TwitterClient(cookies["auth_token"], cookies["ct0"])
+    rate_limit_config = (config or {}).get("rateLimit")
+    return TwitterClient(cookies["auth_token"], cookies["ct0"], rate_limit_config)
 
 
 def _resolve_fetch_count(max_count, configured):
@@ -128,7 +129,7 @@ def feed(feed_type, max_count, as_json, input_file, output_file, do_filter):
             console.print("   Loaded %d tweets" % len(tweets))
         else:
             fetch_count = _resolve_fetch_count(max_count, config.get("fetch", {}).get("count", 50))
-            client = _get_client()
+            client = _get_client(config)
             label = "following feed" if feed_type == "following" else "home timeline"
             console.print("📡 Fetching %s (%d tweets)...\n" % (label, fetch_count))
             start = time.time()
@@ -169,7 +170,7 @@ def favorite(max_count, as_json, output_file, do_filter):
     config = load_config()
     try:
         fetch_count = _resolve_fetch_count(max_count, config.get("fetch", {}).get("count", 50))
-        client = _get_client()
+        client = _get_client(config)
         console.print("🔖 Fetching favorites (%d tweets)...\n" % fetch_count)
         start = time.time()
         tweets = client.fetch_bookmarks(fetch_count)
@@ -199,8 +200,9 @@ def user(screen_name):
     # type: (str,) -> None
     """View a user's profile. SCREEN_NAME is the @handle (without @)."""
     screen_name = screen_name.lstrip("@")
+    config = load_config()
     try:
-        client = _get_client()
+        client = _get_client(config)
         console.print("👤 Fetching user @%s..." % screen_name)
         profile = client.fetch_user(screen_name)
     except RuntimeError as exc:
@@ -219,9 +221,10 @@ def user_posts(screen_name, max_count, as_json):
     # type: (str, int, bool) -> None
     """List a user's tweets. SCREEN_NAME is the @handle (without @)."""
     screen_name = screen_name.lstrip("@")
+    config = load_config()
     try:
         fetch_count = _resolve_fetch_count(max_count, 20)
-        client = _get_client()
+        client = _get_client(config)
         console.print("👤 Fetching @%s's profile..." % screen_name)
         profile = client.fetch_user(screen_name)
         console.print("📝 Fetching tweets (%d)...\n" % fetch_count)
diff --git a/twitter_cli/client.py b/twitter_cli/client.py
index fd208b3..cbc6231 100644
--- a/twitter_cli/client.py
+++ b/twitter_cli/client.py
@@ -6,6 +6,7 @@ import json
 import logging
 import math
 import re
+import time
 import ssl
 import urllib.error
 import urllib.parse
@@ -201,13 +202,22 @@ def _resolve_query_id(operation_name, prefer_fallback=True):
     raise RuntimeError('Cannot resolve queryId for "%s"' % operation_name)
 
 
+# Hard ceiling to prevent accidental massive fetches
+_ABSOLUTE_MAX_COUNT = 500
+
+
 class TwitterClient:
     """Twitter GraphQL API client using cookie authentication."""
 
-    def __init__(self, auth_token, ct0):
-        # type: (str, str) -> None
+    def __init__(self, auth_token, ct0, rate_limit_config=None):
+        # type: (str, str, Optional[Dict[str, Any]]) -> None
         self._auth_token = auth_token
         self._ct0 = ct0
+        rl = rate_limit_config or {}
+        self._request_delay = float(rl.get("requestDelay", 1.5))
+        self._max_retries = int(rl.get("maxRetries", 3))
+        self._retry_base_delay = float(rl.get("retryBaseDelay", 5.0))
+        self._max_count = min(int(rl.get("maxCount", 200)), _ABSOLUTE_MAX_COUNT)
 
     def fetch_home_timeline(self, count=20):
         # type: (int) -> List[Tweet]
@@ -308,6 +318,9 @@ class TwitterClient:
         if count <= 0:
             return []
 
+        # Enforce max count cap
+        count = min(count, self._max_count)
+
         tweets = []  # type: List[Tweet]
         seen_ids = set()  # type: Set[str]
         cursor = None  # type: Optional[str]
@@ -339,6 +352,11 @@ class TwitterClient:
                 break
             cursor = next_cursor
 
+            # Rate-limit: sleep between paginated requests
+            if len(tweets) < count and self._request_delay > 0:
+                logger.debug("Sleeping %.1fs between requests", self._request_delay)
+                time.sleep(self._request_delay)
+
         return tweets[:count]
 
     def _graphql_get(self, operation_name, variables, features):
@@ -379,31 +397,54 @@ class TwitterClient:
 
     def _api_get(self, url):
         # type: (str) -> Dict[str, Any]
-        """Make authenticated GET request to Twitter API."""
+        """Make authenticated GET request to Twitter API, retrying on HTTP 429 or error code 88."""
         headers = self._build_headers()
-        request = urllib.request.Request(url)
-        for key, value in headers.items():
-            request.add_header(key, value)
 
-        try:
-            with urllib.request.urlopen(request, context=_create_ssl_context(), timeout=30) as response:
-                payload = response.read().decode("utf-8")
-        except urllib.error.HTTPError as exc:
-            body = exc.read().decode("utf-8", errors="replace")
-            message = "Twitter API error %d: %s" % (exc.code, body[:500])
-            raise TwitterAPIError(exc.code, message)
-        except urllib.error.URLError as exc:
-            raise TwitterAPIError(0, "Twitter API network error: %s" % exc.reason)
+        for attempt in range(self._max_retries + 1):
+            request = urllib.request.Request(url)
+            for key, value in headers.items():
+                request.add_header(key, value)
 
-        try:
-            parsed = json.loads(payload)
-        except json.JSONDecodeError:
-            raise TwitterAPIError(0, "Twitter API returned invalid JSON")
+            try:
+                with urllib.request.urlopen(request, context=_create_ssl_context(), timeout=30) as response:
+                    payload = response.read().decode("utf-8")
+            except urllib.error.HTTPError as exc:
+                if exc.code == 429 and attempt < self._max_retries:
+                    wait = self._retry_base_delay * (2 ** attempt)
+                    logger.warning(
+                        "Rate limited (429), retrying in %.1fs (attempt %d/%d)",
+                        wait, attempt + 1, self._max_retries,
+                    )
+                    time.sleep(wait)
+                    continue
+                body = exc.read().decode("utf-8", errors="replace")
+                message = "Twitter API error %d: %s" % (exc.code, body[:500])
+                raise TwitterAPIError(exc.code, message)
+            except urllib.error.URLError as exc:
+                raise TwitterAPIError(0, "Twitter API network error: %s" % exc.reason)
 
-        if isinstance(parsed, dict) and parsed.get("errors"):
-            message = parsed["errors"][0].get("message", "Unknown error")
-            raise TwitterAPIError(0, "Twitter API returned errors: %s" % message)
-        return parsed
+            try:
+                parsed = json.loads(payload)
+            except json.JSONDecodeError:
+                raise TwitterAPIError(0, "Twitter API returned invalid JSON")
+
+            if isinstance(parsed, dict) and parsed.get("errors"):
+                err_msg = parsed["errors"][0].get("message", "Unknown error")
+                # Rate limit can also surface as a JSON error (code 88)
+                err_code = parsed["errors"][0].get("code", 0)
+                if err_code == 88 and attempt < self._max_retries:
+                    wait = self._retry_base_delay * (2 ** attempt)
+                    logger.warning(
+                        "Rate limited (code 88), retrying in %.1fs (attempt %d/%d)",
+                        wait, attempt + 1, self._max_retries,
+                    )
+                    time.sleep(wait)
+                    continue
+                raise TwitterAPIError(0, "Twitter API returned errors: %s" % err_msg)
+            return parsed
+
+        # Only reachable when max_retries is negative (the loop body never runs)
+        raise TwitterAPIError(429, "Rate limited after %d retries" % self._max_retries)
 
     def _parse_timeline_response(self, data, get_instructions):
         # type: (Any, Callable[[Any], Any]) -> Tuple[List[Tweet], Optional[str]]
diff --git a/twitter_cli/config.py b/twitter_cli/config.py
index 819a95b..0f3f6f4 100644
--- a/twitter_cli/config.py
+++ b/twitter_cli/config.py
@@ -29,6 +29,12 @@ DEFAULT_CONFIG = {
             "views_log": 0.5,
         },
     },
+    "rateLimit": {
+        "requestDelay": 1.5,
+        "maxRetries": 3,
+        "retryBaseDelay": 5.0,
+        "maxCount": 200,
+    },
 }  # type: Dict[str, Any]
 
 
@@ -128,6 +134,17 @@ def _normalize_config(config):
     filter_config["weights"] = normalized_weights
     merged["filter"] = filter_config
 
+    # Normalize rateLimit section
+    rl = merged.get("rateLimit")
+    if not isinstance(rl, dict):
+        rl = {}
+    default_rl = DEFAULT_CONFIG["rateLimit"]
+    rl["requestDelay"] = max(_as_float(rl.get("requestDelay"), default_rl["requestDelay"]), 0.0)
+    rl["maxRetries"] = max(_as_int(rl.get("maxRetries"), default_rl["maxRetries"]), 0)
+    rl["retryBaseDelay"] = max(_as_float(rl.get("retryBaseDelay"), default_rl["retryBaseDelay"]), 1.0)
+    rl["maxCount"] = max(_as_int(rl.get("maxCount"), default_rl["maxCount"]), 1)
+    merged["rateLimit"] = rl
+
     return merged