feat: add rate limiting, retry with backoff, and max count cap

- Add configurable request delay between paginated API calls (default 1.5s)
- Add retry with exponential backoff on HTTP 429 and Twitter error code 88
- Add hard max count cap (default 200, absolute ceiling 500)
- Add rateLimit config section with requestDelay, maxRetries, retryBaseDelay, maxCount
- Add normalization tests for rateLimit config
2026-03-07 19:02:49 +08:00
parent 0f26e20abb
commit 55c48b077b
6 changed files with 125 additions and 31 deletions
@@ -13,3 +13,9 @@ filter:
    replies: 2.0
    bookmarks: 5.0
    views_log: 0.5
 rateLimit:
  requestDelay: 1.5    # seconds between paginated requests
  maxRetries: 3        # retry count on 429 / rate-limit errors
  retryBaseDelay: 5.0  # base delay for exponential backoff (seconds)
  maxCount: 200        # hard cap for single fetch
@@ -12,7 +12,7 @@ def test_cli_user_command_works_with_client_factory(monkeypatch) -> None:
        def fetch_user(self, screen_name: str) -> UserProfile:
            return UserProfile(id="1", name="Alice", screen_name=screen_name)
-    monkeypatch.setattr("twitter_cli.cli._get_client", lambda: FakeClient())
+    monkeypatch.setattr("twitter_cli.cli._get_client", lambda config=None: FakeClient())
    runner = CliRunner()
    result = runner.invoke(cli, ["user", "alice"])
    assert result.exit_code == 0
@@ -33,3 +33,30 @@ def test_filter_normalization_for_invalid_values(tmp_path: Path) -> None:
    assert config["filter"]["lang"] == []
    assert config["filter"]["weights"]["likes"] == 1.0
    assert config["filter"]["weights"]["retweets"] == 4.0
    # rateLimit should get defaults since it wasn't in the yaml
    assert config["rateLimit"]["requestDelay"] == 1.5
    assert config["rateLimit"]["maxRetries"] == 3
    assert config["rateLimit"]["retryBaseDelay"] == 5.0
    assert config["rateLimit"]["maxCount"] == 200
 def test_rate_limit_normalization(tmp_path: Path) -> None:
    """Out-of-range or malformed rateLimit values are clamped or replaced by defaults."""
    yaml_lines = [
        "rateLimit:",
        "  requestDelay: -2",
        "  maxRetries: bad",
        "  retryBaseDelay: 0.1",
        "  maxCount: 0",
    ]
    config_file = tmp_path / "config.yaml"
    config_file.write_text("\n".join(yaml_lines), encoding="utf-8")
    rate_limit = load_config(str(config_file))["rateLimit"]
    # A negative delay is raised to the 0.0 floor.
    assert rate_limit["requestDelay"] == 0.0
    # A non-numeric retry count falls back to the default of 3.
    assert rate_limit["maxRetries"] == 3
    # A base delay below 1.0 is raised to the 1.0 floor.
    assert rate_limit["retryBaseDelay"] == 1.0
    # A zero cap is raised to the minimum of 1.
    assert rate_limit["maxCount"] == 1
@@ -60,15 +60,16 @@ def _load_tweets_from_json(path):
        raise RuntimeError("Invalid tweet JSON file %s: %s" % (path, exc))
-def _get_client():
+def _get_client(config=None):
-    # type: () -> TwitterClient
+    # type: (Optional[Dict[str, Any]]) -> TwitterClient
    """Create an authenticated API client."""
    console.print("\n🔐 Getting Twitter cookies...")
    try:
        cookies = get_cookies()
    except RuntimeError as exc:
        raise RuntimeError(str(exc))
-    return TwitterClient(cookies["auth_token"], cookies["ct0"])
+    rate_limit_config = (config or {}).get("rateLimit")
    return TwitterClient(cookies["auth_token"], cookies["ct0"], rate_limit_config)
 def _resolve_fetch_count(max_count, configured):
@@ -128,7 +129,7 @@ def feed(feed_type, max_count, as_json, input_file, output_file, do_filter):
            console.print("   Loaded %d tweets" % len(tweets))
        else:
            fetch_count = _resolve_fetch_count(max_count, config.get("fetch", {}).get("count", 50))
-            client = _get_client()
+            client = _get_client(config)
            label = "following feed" if feed_type == "following" else "home timeline"
            console.print("📡 Fetching %s (%d tweets)...\n" % (label, fetch_count))
            start = time.time()
@@ -169,7 +170,7 @@ def favorite(max_count, as_json, output_file, do_filter):
    config = load_config()
    try:
        fetch_count = _resolve_fetch_count(max_count, config.get("fetch", {}).get("count", 50))
-        client = _get_client()
+        client = _get_client(config)
        console.print("🔖 Fetching favorites (%d tweets)...\n" % fetch_count)
        start = time.time()
        tweets = client.fetch_bookmarks(fetch_count)
@@ -199,8 +200,9 @@ def user(screen_name):
    # type: (str,) -> None
    """View a user's profile. SCREEN_NAME is the @handle (without @)."""
    screen_name = screen_name.lstrip("@")
    config = load_config()
    try:
-        client = _get_client()
+        client = _get_client(config)
        console.print("👤 Fetching user @%s..." % screen_name)
        profile = client.fetch_user(screen_name)
    except RuntimeError as exc:
@@ -219,9 +221,10 @@ def user_posts(screen_name, max_count, as_json):
    # type: (str, int, bool) -> None
    """List a user's tweets. SCREEN_NAME is the @handle (without @)."""
    screen_name = screen_name.lstrip("@")
    config = load_config()
    try:
        fetch_count = _resolve_fetch_count(max_count, 20)
-        client = _get_client()
+        client = _get_client(config)
        console.print("👤 Fetching @%s's profile..." % screen_name)
        profile = client.fetch_user(screen_name)
        console.print("📝 Fetching tweets (%d)...\n" % fetch_count)
@@ -6,6 +6,7 @@ import json
 import logging
 import math
 import re
 import time
 import ssl
 import urllib.error
 import urllib.parse
@@ -201,13 +202,22 @@ def _resolve_query_id(operation_name, prefer_fallback=True):
    raise RuntimeError('Cannot resolve queryId for "%s"' % operation_name)
 # Hard ceiling to prevent accidental massive fetches
 _ABSOLUTE_MAX_COUNT = 500
 class TwitterClient:
    """Twitter GraphQL API client using cookie authentication."""
-    def __init__(self, auth_token, ct0):
+    def __init__(self, auth_token, ct0, rate_limit_config=None):
-        # type: (str, str) -> None
+        # type: (str, str, Optional[Dict[str, Any]]) -> None
        self._auth_token = auth_token
        self._ct0 = ct0
        rl = rate_limit_config or {}
        self._request_delay = float(rl.get("requestDelay", 1.5))
        self._max_retries = int(rl.get("maxRetries", 3))
        self._retry_base_delay = float(rl.get("retryBaseDelay", 5.0))
        self._max_count = min(int(rl.get("maxCount", 200)), _ABSOLUTE_MAX_COUNT)
    def fetch_home_timeline(self, count=20):
        # type: (int) -> List[Tweet]
@@ -308,6 +318,9 @@ class TwitterClient:
        if count <= 0:
            return []
        # Enforce max count cap
        count = min(count, self._max_count)
        tweets = []  # type: List[Tweet]
        seen_ids = set()  # type: Set[str]
        cursor = None  # type: Optional[str]
@@ -339,6 +352,11 @@ class TwitterClient:
                break
            cursor = next_cursor
            # Rate-limit: sleep between paginated requests
            if len(tweets) < count and self._request_delay > 0:
                logger.debug("Sleeping %.1fs between requests", self._request_delay)
                time.sleep(self._request_delay)
        return tweets[:count]
    def _graphql_get(self, operation_name, variables, features):
@@ -379,31 +397,54 @@ class TwitterClient:
    def _api_get(self, url):
        # type: (str) -> Dict[str, Any]
        """Make an authenticated GET request to the Twitter API, retrying on rate limits.

        A request is retried when the server answers HTTP 429, or when the
        first entry of the JSON ``errors`` list carries code 88. The wait
        before retry ``n`` (0-based) is ``retry_base_delay * 2 ** n`` seconds,
        and at most ``max_retries`` retries are made.

        Returns:
            The decoded JSON payload.

        Raises:
            TwitterAPIError: on a non-retryable HTTP error, a network error,
                invalid JSON, an ``errors`` payload, or rate limiting that
                persists after the last retry.
        """
        headers = self._build_headers()
        # Always attempt at least one request, even if a negative retry count
        # slipped through.
        max_retries = max(self._max_retries, 0)

        for attempt in range(max_retries + 1):
            can_retry = attempt < max_retries
            # Build the request once per attempt; the headers argument is
            # applied the same way as repeated add_header() calls.
            request = urllib.request.Request(url, headers=headers)

            try:
                with urllib.request.urlopen(request, context=_create_ssl_context(), timeout=30) as response:
                    payload = response.read().decode("utf-8")
            except urllib.error.HTTPError as exc:
                if exc.code == 429 and can_retry:
                    self._wait_before_retry(attempt, max_retries, "429")
                    continue
                body = exc.read().decode("utf-8", errors="replace")
                message = "Twitter API error %d: %s" % (exc.code, body[:500])
                raise TwitterAPIError(exc.code, message)
            except urllib.error.URLError as exc:
                raise TwitterAPIError(0, "Twitter API network error: %s" % exc.reason)

            try:
                parsed = json.loads(payload)
            except json.JSONDecodeError:
                raise TwitterAPIError(0, "Twitter API returned invalid JSON")

            if isinstance(parsed, dict) and parsed.get("errors"):
                first_error = parsed["errors"][0]
                err_msg = first_error.get("message", "Unknown error")
                # Rate limit can also surface as a JSON error (code 88)
                if first_error.get("code", 0) == 88 and can_retry:
                    self._wait_before_retry(attempt, max_retries, "code 88")
                    continue
                raise TwitterAPIError(0, "Twitter API returned errors: %s" % err_msg)

            return parsed

        # Not reachable: the final attempt always returns or raises. Kept so
        # the method can never fall through and return None.
        raise TwitterAPIError(429, "Rate limited after %d retries" % max_retries)

    def _wait_before_retry(self, attempt, max_retries, reason):
        # type: (int, int, str) -> None
        """Log and sleep for the exponential-backoff delay before the next retry."""
        wait = self._retry_base_delay * (2 ** attempt)
        logger.warning(
            "Rate limited (%s), retrying in %.1fs (attempt %d/%d)",
            reason, wait, attempt + 1, max_retries,
        )
        time.sleep(wait)
    def _parse_timeline_response(self, data, get_instructions):
        # type: (Any, Callable[[Any], Any]) -> Tuple[List[Tweet], Optional[str]]
@@ -29,6 +29,12 @@ DEFAULT_CONFIG = {
            "views_log": 0.5,
        },
    },
    "rateLimit": {
        "requestDelay": 1.5,
        "maxRetries": 3,
        "retryBaseDelay": 5.0,
        "maxCount": 200,
    },
 }  # type: Dict[str, Any]
@@ -128,6 +134,17 @@ def _normalize_config(config):
    filter_config["weights"] = normalized_weights
    merged["filter"] = filter_config
    # Normalize rateLimit section
    rl = merged.get("rateLimit")
    if not isinstance(rl, dict):
        rl = {}
    default_rl = DEFAULT_CONFIG["rateLimit"]
    rl["requestDelay"] = max(_as_float(rl.get("requestDelay"), default_rl["requestDelay"]), 0.0)
    rl["maxRetries"] = max(_as_int(rl.get("maxRetries"), default_rl["maxRetries"]), 0)
    rl["retryBaseDelay"] = max(_as_float(rl.get("retryBaseDelay"), default_rl["retryBaseDelay"]), 1.0)
    rl["maxCount"] = max(_as_int(rl.get("maxCount"), default_rl["maxCount"]), 1)
    merged["rateLimit"] = rl
    return merged