feat: add rate limiting, retry with backoff, and max count cap

- Add configurable request delay between paginated API calls (default 1.5s)
- Add retry with exponential backoff on HTTP 429 and Twitter error code 88
- Add hard max count cap (default 200, absolute ceiling 500)
- Add rateLimit config section with requestDelay, maxRetries, retryBaseDelay, maxCount
- Add normalization tests for rateLimit config
This commit is contained in:
jackwener
2026-03-07 19:02:49 +08:00
parent 0f26e20abb
commit 55c48b077b
6 changed files with 125 additions and 31 deletions

View File

@@ -13,3 +13,9 @@ filter:
replies: 2.0 replies: 2.0
bookmarks: 5.0 bookmarks: 5.0
views_log: 0.5 views_log: 0.5
rateLimit:
requestDelay: 1.5 # seconds between paginated requests
maxRetries: 3 # retry count on 429 / rate-limit errors
retryBaseDelay: 5.0 # base delay for exponential backoff (seconds)
maxCount: 200 # hard cap for a single fetch (the client also enforces an absolute ceiling of 500)

View File

@@ -12,7 +12,7 @@ def test_cli_user_command_works_with_client_factory(monkeypatch) -> None:
def fetch_user(self, screen_name: str) -> UserProfile: def fetch_user(self, screen_name: str) -> UserProfile:
return UserProfile(id="1", name="Alice", screen_name=screen_name) return UserProfile(id="1", name="Alice", screen_name=screen_name)
monkeypatch.setattr("twitter_cli.cli._get_client", lambda: FakeClient()) monkeypatch.setattr("twitter_cli.cli._get_client", lambda config=None: FakeClient())
runner = CliRunner() runner = CliRunner()
result = runner.invoke(cli, ["user", "alice"]) result = runner.invoke(cli, ["user", "alice"])
assert result.exit_code == 0 assert result.exit_code == 0

View File

@@ -33,3 +33,30 @@ def test_filter_normalization_for_invalid_values(tmp_path: Path) -> None:
assert config["filter"]["lang"] == [] assert config["filter"]["lang"] == []
assert config["filter"]["weights"]["likes"] == 1.0 assert config["filter"]["weights"]["likes"] == 1.0
assert config["filter"]["weights"]["retweets"] == 4.0 assert config["filter"]["weights"]["retweets"] == 4.0
# rateLimit should get defaults since it wasn't in the yaml
assert config["rateLimit"]["requestDelay"] == 1.5
assert config["rateLimit"]["maxRetries"] == 3
assert config["rateLimit"]["retryBaseDelay"] == 5.0
assert config["rateLimit"]["maxCount"] == 200
def test_rate_limit_normalization(tmp_path: Path) -> None:
    """Check that invalid ``rateLimit`` values are clamped or replaced by defaults."""
    yaml_lines = [
        "rateLimit:",
        " requestDelay: -2",
        " maxRetries: bad",
        " retryBaseDelay: 0.1",
        " maxCount: 0",
    ]
    config_file = tmp_path / "config.yaml"
    config_file.write_text("\n".join(yaml_lines), encoding="utf-8")

    rate_limit = load_config(str(config_file))["rateLimit"]

    expected = {
        "requestDelay": 0.0,  # clamped to >= 0
        "maxRetries": 3,  # fallback to default
        "retryBaseDelay": 1.0,  # clamped to >= 1.0
        "maxCount": 1,  # clamped to >= 1
    }
    for key, value in expected.items():
        assert rate_limit[key] == value

View File

@@ -60,15 +60,16 @@ def _load_tweets_from_json(path):
raise RuntimeError("Invalid tweet JSON file %s: %s" % (path, exc)) raise RuntimeError("Invalid tweet JSON file %s: %s" % (path, exc))
def _get_client(config=None):
    # type: (Optional[Dict[str, Any]]) -> TwitterClient
    """Create an authenticated API client.

    Args:
        config: Optional loaded configuration mapping. Its ``rateLimit``
            entry (if any) is handed to ``TwitterClient``; when ``config``
            is missing or empty, ``None`` is passed instead.

    Returns:
        A ``TwitterClient`` built from the ``auth_token`` and ``ct0`` cookies.

    Raises:
        RuntimeError: If ``get_cookies()`` fails; the original message is kept.
    """
    console.print("\n🔐 Getting Twitter cookies...")
    try:
        cookies = get_cookies()
    except RuntimeError as exc:
        raise RuntimeError(str(exc))

    settings = config if config else {}
    rate_limits = settings.get("rateLimit")
    return TwitterClient(
        cookies["auth_token"],
        cookies["ct0"],
        rate_limits,
    )
def _resolve_fetch_count(max_count, configured): def _resolve_fetch_count(max_count, configured):
@@ -128,7 +129,7 @@ def feed(feed_type, max_count, as_json, input_file, output_file, do_filter):
console.print(" Loaded %d tweets" % len(tweets)) console.print(" Loaded %d tweets" % len(tweets))
else: else:
fetch_count = _resolve_fetch_count(max_count, config.get("fetch", {}).get("count", 50)) fetch_count = _resolve_fetch_count(max_count, config.get("fetch", {}).get("count", 50))
client = _get_client() client = _get_client(config)
label = "following feed" if feed_type == "following" else "home timeline" label = "following feed" if feed_type == "following" else "home timeline"
console.print("📡 Fetching %s (%d tweets)...\n" % (label, fetch_count)) console.print("📡 Fetching %s (%d tweets)...\n" % (label, fetch_count))
start = time.time() start = time.time()
@@ -169,7 +170,7 @@ def favorite(max_count, as_json, output_file, do_filter):
config = load_config() config = load_config()
try: try:
fetch_count = _resolve_fetch_count(max_count, config.get("fetch", {}).get("count", 50)) fetch_count = _resolve_fetch_count(max_count, config.get("fetch", {}).get("count", 50))
client = _get_client() client = _get_client(config)
console.print("🔖 Fetching favorites (%d tweets)...\n" % fetch_count) console.print("🔖 Fetching favorites (%d tweets)...\n" % fetch_count)
start = time.time() start = time.time()
tweets = client.fetch_bookmarks(fetch_count) tweets = client.fetch_bookmarks(fetch_count)
@@ -199,8 +200,9 @@ def user(screen_name):
# type: (str,) -> None # type: (str,) -> None
"""View a user's profile. SCREEN_NAME is the @handle (without @).""" """View a user's profile. SCREEN_NAME is the @handle (without @)."""
screen_name = screen_name.lstrip("@") screen_name = screen_name.lstrip("@")
config = load_config()
try: try:
client = _get_client() client = _get_client(config)
console.print("👤 Fetching user @%s..." % screen_name) console.print("👤 Fetching user @%s..." % screen_name)
profile = client.fetch_user(screen_name) profile = client.fetch_user(screen_name)
except RuntimeError as exc: except RuntimeError as exc:
@@ -219,9 +221,10 @@ def user_posts(screen_name, max_count, as_json):
# type: (str, int, bool) -> None # type: (str, int, bool) -> None
"""List a user's tweets. SCREEN_NAME is the @handle (without @).""" """List a user's tweets. SCREEN_NAME is the @handle (without @)."""
screen_name = screen_name.lstrip("@") screen_name = screen_name.lstrip("@")
config = load_config()
try: try:
fetch_count = _resolve_fetch_count(max_count, 20) fetch_count = _resolve_fetch_count(max_count, 20)
client = _get_client() client = _get_client(config)
console.print("👤 Fetching @%s's profile..." % screen_name) console.print("👤 Fetching @%s's profile..." % screen_name)
profile = client.fetch_user(screen_name) profile = client.fetch_user(screen_name)
console.print("📝 Fetching tweets (%d)...\n" % fetch_count) console.print("📝 Fetching tweets (%d)...\n" % fetch_count)

View File

@@ -6,6 +6,7 @@ import json
import logging import logging
import math import math
import re import re
import time
import ssl import ssl
import urllib.error import urllib.error
import urllib.parse import urllib.parse
@@ -201,13 +202,22 @@ def _resolve_query_id(operation_name, prefer_fallback=True):
raise RuntimeError('Cannot resolve queryId for "%s"' % operation_name) raise RuntimeError('Cannot resolve queryId for "%s"' % operation_name)
# Hard ceiling to prevent accidental massive fetches
_ABSOLUTE_MAX_COUNT = 500
class TwitterClient: class TwitterClient:
"""Twitter GraphQL API client using cookie authentication.""" """Twitter GraphQL API client using cookie authentication."""
def __init__(self, auth_token, ct0, rate_limit_config=None):
    # type: (str, str, Optional[Dict[str, Any]]) -> None
    """Store the cookie credentials and the rate-limit settings.

    Args:
        auth_token: Value of the ``auth_token`` cookie.
        ct0: Value of the ``ct0`` cookie.
        rate_limit_config: Optional mapping with the keys ``requestDelay``
            (seconds, default 1.5), ``maxRetries`` (default 3),
            ``retryBaseDelay`` (seconds, default 5.0) and ``maxCount``
            (default 200). Missing keys fall back to those defaults.

    Out-of-range values are clamped rather than rejected, so a client
    built directly (without the config loader's normalization) still
    behaves sanely: delays and the retry count are never negative, and
    ``maxCount`` is kept within ``1.._ABSOLUTE_MAX_COUNT``.
    """
    self._auth_token = auth_token
    self._ct0 = ct0

    rl = rate_limit_config or {}
    # time.sleep() raises ValueError for negative durations, so floor the delays at 0.
    self._request_delay = max(float(rl.get("requestDelay", 1.5)), 0.0)
    self._retry_base_delay = max(float(rl.get("retryBaseDelay", 5.0)), 0.0)
    # A negative retry count would make the request loop run zero times.
    self._max_retries = max(int(rl.get("maxRetries", 3)), 0)
    # Lower bound of 1 keeps the per-fetch cap positive; upper bound is the hard ceiling.
    self._max_count = max(1, min(int(rl.get("maxCount", 200)), _ABSOLUTE_MAX_COUNT))
def fetch_home_timeline(self, count=20): def fetch_home_timeline(self, count=20):
# type: (int) -> List[Tweet] # type: (int) -> List[Tweet]
@@ -308,6 +318,9 @@ class TwitterClient:
if count <= 0: if count <= 0:
return [] return []
# Enforce max count cap
count = min(count, self._max_count)
tweets = [] # type: List[Tweet] tweets = [] # type: List[Tweet]
seen_ids = set() # type: Set[str] seen_ids = set() # type: Set[str]
cursor = None # type: Optional[str] cursor = None # type: Optional[str]
@@ -339,6 +352,11 @@ class TwitterClient:
break break
cursor = next_cursor cursor = next_cursor
# Rate-limit: sleep between paginated requests
if len(tweets) < count and self._request_delay > 0:
logger.debug("Sleeping %.1fs between requests", self._request_delay)
time.sleep(self._request_delay)
return tweets[:count] return tweets[:count]
def _graphql_get(self, operation_name, variables, features): def _graphql_get(self, operation_name, variables, features):
@@ -379,31 +397,54 @@ class TwitterClient:
def _api_get(self, url):
    # type: (str) -> Dict[str, Any]
    """Make an authenticated GET request to the Twitter API with retry on rate limits.

    The request is retried when the server answers HTTP 429, or when the
    JSON payload's first error entry carries code 88. The wait before
    retry ``k`` (0-based) is ``retry_base_delay * 2 ** k`` seconds, and at
    most ``max_retries`` retries are made after the initial attempt.

    Args:
        url: Fully built request URL.

    Returns:
        The decoded JSON payload.

    Raises:
        TwitterAPIError: On an HTTP error (carrying the HTTP status code),
            or with code 0 on a network error, invalid JSON, or an
            ``errors`` entry in the payload.
    """
    headers = self._build_headers()
    # Never let a negative setting produce an empty loop: at least one
    # request must always be sent.
    max_retries = max(self._max_retries, 0)

    def _backoff(reason, attempt):
        # type: (str, int) -> None
        """Log and sleep for the exponential backoff delay of this attempt."""
        wait = self._retry_base_delay * (2 ** attempt)
        logger.warning(
            "Rate limited (%s), retrying in %.1fs (attempt %d/%d)",
            reason, wait, attempt + 1, max_retries,
        )
        time.sleep(wait)

    for attempt in range(max_retries + 1):
        can_retry = attempt < max_retries

        request = urllib.request.Request(url)
        for key, value in headers.items():
            request.add_header(key, value)

        try:
            with urllib.request.urlopen(request, context=_create_ssl_context(), timeout=30) as response:
                payload = response.read().decode("utf-8")
        except urllib.error.HTTPError as exc:
            if exc.code == 429 and can_retry:
                _backoff("429", attempt)
                continue
            body = exc.read().decode("utf-8", errors="replace")
            message = "Twitter API error %d: %s" % (exc.code, body[:500])
            raise TwitterAPIError(exc.code, message)
        except urllib.error.URLError as exc:
            raise TwitterAPIError(0, "Twitter API network error: %s" % exc.reason)

        try:
            parsed = json.loads(payload)
        except json.JSONDecodeError:
            raise TwitterAPIError(0, "Twitter API returned invalid JSON")

        if isinstance(parsed, dict) and parsed.get("errors"):
            errors = parsed["errors"]
            # Tolerate unexpected shapes (a bare dict or string) instead of
            # crashing with KeyError/AttributeError while reporting the error.
            first_error = errors[0] if isinstance(errors, (list, tuple)) else errors
            if not isinstance(first_error, dict):
                first_error = {"message": str(first_error)}
            err_msg = first_error.get("message", "Unknown error")
            # Rate limit can also surface as a JSON error (code 88)
            if first_error.get("code", 0) == 88 and can_retry:
                _backoff("code 88", attempt)
                continue
            raise TwitterAPIError(0, "Twitter API returned errors: %s" % err_msg)

        return parsed

    # Defensive only: every iteration above returns, raises, or continues,
    # and the final attempt can never continue.
    raise TwitterAPIError(429, "Rate limited after %d retries" % max_retries)
def _parse_timeline_response(self, data, get_instructions): def _parse_timeline_response(self, data, get_instructions):
# type: (Any, Callable[[Any], Any]) -> Tuple[List[Tweet], Optional[str]] # type: (Any, Callable[[Any], Any]) -> Tuple[List[Tweet], Optional[str]]

View File

@@ -29,6 +29,12 @@ DEFAULT_CONFIG = {
"views_log": 0.5, "views_log": 0.5,
}, },
}, },
"rateLimit": {
"requestDelay": 1.5,
"maxRetries": 3,
"retryBaseDelay": 5.0,
"maxCount": 200,
},
} # type: Dict[str, Any] } # type: Dict[str, Any]
@@ -128,6 +134,17 @@ def _normalize_config(config):
filter_config["weights"] = normalized_weights filter_config["weights"] = normalized_weights
merged["filter"] = filter_config merged["filter"] = filter_config
# Normalize rateLimit section
rl = merged.get("rateLimit")
if not isinstance(rl, dict):
rl = {}
default_rl = DEFAULT_CONFIG["rateLimit"]
rl["requestDelay"] = max(_as_float(rl.get("requestDelay"), default_rl["requestDelay"]), 0.0)
rl["maxRetries"] = max(_as_int(rl.get("maxRetries"), default_rl["maxRetries"]), 0)
rl["retryBaseDelay"] = max(_as_float(rl.get("retryBaseDelay"), default_rl["retryBaseDelay"]), 1.0)
rl["maxCount"] = max(_as_int(rl.get("maxCount"), default_rl["maxCount"]), 1)
merged["rateLimit"] = rl
return merged return merged