feat: add rate limiting, retry with backoff, and max count cap
- Add configurable request delay between paginated API calls (default 1.5s)
- Add retry with exponential backoff on HTTP 429 and Twitter error code 88
- Add hard max count cap (default 200, absolute ceiling 500)
- Add rateLimit config section with requestDelay, maxRetries, retryBaseDelay, maxCount
- Add normalization tests for rateLimit config
This commit is contained in:
@@ -13,3 +13,9 @@ filter:
|
|||||||
replies: 2.0
|
replies: 2.0
|
||||||
bookmarks: 5.0
|
bookmarks: 5.0
|
||||||
views_log: 0.5
|
views_log: 0.5
|
||||||
|
|
||||||
|
rateLimit:
|
||||||
|
requestDelay: 1.5 # seconds between paginated requests
|
||||||
|
maxRetries: 3 # retry count on 429 / rate-limit errors
|
||||||
|
retryBaseDelay: 5.0 # base delay for exponential backoff (seconds)
|
||||||
|
maxCount: 200 # hard cap for single fetch
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ def test_cli_user_command_works_with_client_factory(monkeypatch) -> None:
|
|||||||
def fetch_user(self, screen_name: str) -> UserProfile:
|
def fetch_user(self, screen_name: str) -> UserProfile:
|
||||||
return UserProfile(id="1", name="Alice", screen_name=screen_name)
|
return UserProfile(id="1", name="Alice", screen_name=screen_name)
|
||||||
|
|
||||||
monkeypatch.setattr("twitter_cli.cli._get_client", lambda: FakeClient())
|
monkeypatch.setattr("twitter_cli.cli._get_client", lambda config=None: FakeClient())
|
||||||
runner = CliRunner()
|
runner = CliRunner()
|
||||||
result = runner.invoke(cli, ["user", "alice"])
|
result = runner.invoke(cli, ["user", "alice"])
|
||||||
assert result.exit_code == 0
|
assert result.exit_code == 0
|
||||||
|
|||||||
@@ -33,3 +33,30 @@ def test_filter_normalization_for_invalid_values(tmp_path: Path) -> None:
|
|||||||
assert config["filter"]["lang"] == []
|
assert config["filter"]["lang"] == []
|
||||||
assert config["filter"]["weights"]["likes"] == 1.0
|
assert config["filter"]["weights"]["likes"] == 1.0
|
||||||
assert config["filter"]["weights"]["retweets"] == 4.0
|
assert config["filter"]["weights"]["retweets"] == 4.0
|
||||||
|
# rateLimit should get defaults since it wasn't in the yaml
|
||||||
|
assert config["rateLimit"]["requestDelay"] == 1.5
|
||||||
|
assert config["rateLimit"]["maxRetries"] == 3
|
||||||
|
assert config["rateLimit"]["retryBaseDelay"] == 5.0
|
||||||
|
assert config["rateLimit"]["maxCount"] == 200
|
||||||
|
|
||||||
|
|
||||||
|
def test_rate_limit_normalization(tmp_path: Path) -> None:
|
||||||
|
config_file = tmp_path / "config.yaml"
|
||||||
|
config_file.write_text(
|
||||||
|
"\n".join(
|
||||||
|
[
|
||||||
|
"rateLimit:",
|
||||||
|
" requestDelay: -2",
|
||||||
|
" maxRetries: bad",
|
||||||
|
" retryBaseDelay: 0.1",
|
||||||
|
" maxCount: 0",
|
||||||
|
]
|
||||||
|
),
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
|
||||||
|
config = load_config(str(config_file))
|
||||||
|
assert config["rateLimit"]["requestDelay"] == 0.0 # clamped to >= 0
|
||||||
|
assert config["rateLimit"]["maxRetries"] == 3 # fallback to default
|
||||||
|
assert config["rateLimit"]["retryBaseDelay"] == 1.0 # clamped to >= 1.0
|
||||||
|
assert config["rateLimit"]["maxCount"] == 1 # clamped to >= 1
|
||||||
|
|||||||
@@ -60,15 +60,16 @@ def _load_tweets_from_json(path):
|
|||||||
raise RuntimeError("Invalid tweet JSON file %s: %s" % (path, exc))
|
raise RuntimeError("Invalid tweet JSON file %s: %s" % (path, exc))
|
||||||
|
|
||||||
|
|
||||||
def _get_client():
|
def _get_client(config=None):
|
||||||
# type: () -> TwitterClient
|
# type: (Optional[Dict[str, Any]]) -> TwitterClient
|
||||||
"""Create an authenticated API client."""
|
"""Create an authenticated API client."""
|
||||||
console.print("\n🔐 Getting Twitter cookies...")
|
console.print("\n🔐 Getting Twitter cookies...")
|
||||||
try:
|
try:
|
||||||
cookies = get_cookies()
|
cookies = get_cookies()
|
||||||
except RuntimeError as exc:
|
except RuntimeError as exc:
|
||||||
raise RuntimeError(str(exc))
|
raise RuntimeError(str(exc))
|
||||||
return TwitterClient(cookies["auth_token"], cookies["ct0"])
|
rate_limit_config = (config or {}).get("rateLimit")
|
||||||
|
return TwitterClient(cookies["auth_token"], cookies["ct0"], rate_limit_config)
|
||||||
|
|
||||||
|
|
||||||
def _resolve_fetch_count(max_count, configured):
|
def _resolve_fetch_count(max_count, configured):
|
||||||
@@ -128,7 +129,7 @@ def feed(feed_type, max_count, as_json, input_file, output_file, do_filter):
|
|||||||
console.print(" Loaded %d tweets" % len(tweets))
|
console.print(" Loaded %d tweets" % len(tweets))
|
||||||
else:
|
else:
|
||||||
fetch_count = _resolve_fetch_count(max_count, config.get("fetch", {}).get("count", 50))
|
fetch_count = _resolve_fetch_count(max_count, config.get("fetch", {}).get("count", 50))
|
||||||
client = _get_client()
|
client = _get_client(config)
|
||||||
label = "following feed" if feed_type == "following" else "home timeline"
|
label = "following feed" if feed_type == "following" else "home timeline"
|
||||||
console.print("📡 Fetching %s (%d tweets)...\n" % (label, fetch_count))
|
console.print("📡 Fetching %s (%d tweets)...\n" % (label, fetch_count))
|
||||||
start = time.time()
|
start = time.time()
|
||||||
@@ -169,7 +170,7 @@ def favorite(max_count, as_json, output_file, do_filter):
|
|||||||
config = load_config()
|
config = load_config()
|
||||||
try:
|
try:
|
||||||
fetch_count = _resolve_fetch_count(max_count, config.get("fetch", {}).get("count", 50))
|
fetch_count = _resolve_fetch_count(max_count, config.get("fetch", {}).get("count", 50))
|
||||||
client = _get_client()
|
client = _get_client(config)
|
||||||
console.print("🔖 Fetching favorites (%d tweets)...\n" % fetch_count)
|
console.print("🔖 Fetching favorites (%d tweets)...\n" % fetch_count)
|
||||||
start = time.time()
|
start = time.time()
|
||||||
tweets = client.fetch_bookmarks(fetch_count)
|
tweets = client.fetch_bookmarks(fetch_count)
|
||||||
@@ -199,8 +200,9 @@ def user(screen_name):
|
|||||||
# type: (str,) -> None
|
# type: (str,) -> None
|
||||||
"""View a user's profile. SCREEN_NAME is the @handle (without @)."""
|
"""View a user's profile. SCREEN_NAME is the @handle (without @)."""
|
||||||
screen_name = screen_name.lstrip("@")
|
screen_name = screen_name.lstrip("@")
|
||||||
|
config = load_config()
|
||||||
try:
|
try:
|
||||||
client = _get_client()
|
client = _get_client(config)
|
||||||
console.print("👤 Fetching user @%s..." % screen_name)
|
console.print("👤 Fetching user @%s..." % screen_name)
|
||||||
profile = client.fetch_user(screen_name)
|
profile = client.fetch_user(screen_name)
|
||||||
except RuntimeError as exc:
|
except RuntimeError as exc:
|
||||||
@@ -219,9 +221,10 @@ def user_posts(screen_name, max_count, as_json):
|
|||||||
# type: (str, int, bool) -> None
|
# type: (str, int, bool) -> None
|
||||||
"""List a user's tweets. SCREEN_NAME is the @handle (without @)."""
|
"""List a user's tweets. SCREEN_NAME is the @handle (without @)."""
|
||||||
screen_name = screen_name.lstrip("@")
|
screen_name = screen_name.lstrip("@")
|
||||||
|
config = load_config()
|
||||||
try:
|
try:
|
||||||
fetch_count = _resolve_fetch_count(max_count, 20)
|
fetch_count = _resolve_fetch_count(max_count, 20)
|
||||||
client = _get_client()
|
client = _get_client(config)
|
||||||
console.print("👤 Fetching @%s's profile..." % screen_name)
|
console.print("👤 Fetching @%s's profile..." % screen_name)
|
||||||
profile = client.fetch_user(screen_name)
|
profile = client.fetch_user(screen_name)
|
||||||
console.print("📝 Fetching tweets (%d)...\n" % fetch_count)
|
console.print("📝 Fetching tweets (%d)...\n" % fetch_count)
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ import json
|
|||||||
import logging
|
import logging
|
||||||
import math
|
import math
|
||||||
import re
|
import re
|
||||||
|
import time
|
||||||
import ssl
|
import ssl
|
||||||
import urllib.error
|
import urllib.error
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
@@ -201,13 +202,22 @@ def _resolve_query_id(operation_name, prefer_fallback=True):
|
|||||||
raise RuntimeError('Cannot resolve queryId for "%s"' % operation_name)
|
raise RuntimeError('Cannot resolve queryId for "%s"' % operation_name)
|
||||||
|
|
||||||
|
|
||||||
|
# Hard ceiling to prevent accidental massive fetches
|
||||||
|
_ABSOLUTE_MAX_COUNT = 500
|
||||||
|
|
||||||
|
|
||||||
class TwitterClient:
|
class TwitterClient:
|
||||||
"""Twitter GraphQL API client using cookie authentication."""
|
"""Twitter GraphQL API client using cookie authentication."""
|
||||||
|
|
||||||
def __init__(self, auth_token, ct0):
|
def __init__(self, auth_token, ct0, rate_limit_config=None):
|
||||||
# type: (str, str) -> None
|
# type: (str, str, Optional[Dict[str, Any]]) -> None
|
||||||
self._auth_token = auth_token
|
self._auth_token = auth_token
|
||||||
self._ct0 = ct0
|
self._ct0 = ct0
|
||||||
|
rl = rate_limit_config or {}
|
||||||
|
self._request_delay = float(rl.get("requestDelay", 1.5))
|
||||||
|
self._max_retries = int(rl.get("maxRetries", 3))
|
||||||
|
self._retry_base_delay = float(rl.get("retryBaseDelay", 5.0))
|
||||||
|
self._max_count = min(int(rl.get("maxCount", 200)), _ABSOLUTE_MAX_COUNT)
|
||||||
|
|
||||||
def fetch_home_timeline(self, count=20):
|
def fetch_home_timeline(self, count=20):
|
||||||
# type: (int) -> List[Tweet]
|
# type: (int) -> List[Tweet]
|
||||||
@@ -308,6 +318,9 @@ class TwitterClient:
|
|||||||
if count <= 0:
|
if count <= 0:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
# Enforce max count cap
|
||||||
|
count = min(count, self._max_count)
|
||||||
|
|
||||||
tweets = [] # type: List[Tweet]
|
tweets = [] # type: List[Tweet]
|
||||||
seen_ids = set() # type: Set[str]
|
seen_ids = set() # type: Set[str]
|
||||||
cursor = None # type: Optional[str]
|
cursor = None # type: Optional[str]
|
||||||
@@ -339,6 +352,11 @@ class TwitterClient:
|
|||||||
break
|
break
|
||||||
cursor = next_cursor
|
cursor = next_cursor
|
||||||
|
|
||||||
|
# Rate-limit: sleep between paginated requests
|
||||||
|
if len(tweets) < count and self._request_delay > 0:
|
||||||
|
logger.debug("Sleeping %.1fs between requests", self._request_delay)
|
||||||
|
time.sleep(self._request_delay)
|
||||||
|
|
||||||
return tweets[:count]
|
return tweets[:count]
|
||||||
|
|
||||||
def _graphql_get(self, operation_name, variables, features):
|
def _graphql_get(self, operation_name, variables, features):
|
||||||
@@ -379,31 +397,54 @@ class TwitterClient:
|
|||||||
|
|
||||||
def _api_get(self, url):
|
def _api_get(self, url):
|
||||||
# type: (str) -> Dict[str, Any]
|
# type: (str) -> Dict[str, Any]
|
||||||
"""Make authenticated GET request to Twitter API."""
|
"""Make authenticated GET request to Twitter API with retry on 429."""
|
||||||
headers = self._build_headers()
|
headers = self._build_headers()
|
||||||
request = urllib.request.Request(url)
|
|
||||||
for key, value in headers.items():
|
|
||||||
request.add_header(key, value)
|
|
||||||
|
|
||||||
try:
|
for attempt in range(self._max_retries + 1):
|
||||||
with urllib.request.urlopen(request, context=_create_ssl_context(), timeout=30) as response:
|
request = urllib.request.Request(url)
|
||||||
payload = response.read().decode("utf-8")
|
for key, value in headers.items():
|
||||||
except urllib.error.HTTPError as exc:
|
request.add_header(key, value)
|
||||||
body = exc.read().decode("utf-8", errors="replace")
|
|
||||||
message = "Twitter API error %d: %s" % (exc.code, body[:500])
|
|
||||||
raise TwitterAPIError(exc.code, message)
|
|
||||||
except urllib.error.URLError as exc:
|
|
||||||
raise TwitterAPIError(0, "Twitter API network error: %s" % exc.reason)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
parsed = json.loads(payload)
|
with urllib.request.urlopen(request, context=_create_ssl_context(), timeout=30) as response:
|
||||||
except json.JSONDecodeError:
|
payload = response.read().decode("utf-8")
|
||||||
raise TwitterAPIError(0, "Twitter API returned invalid JSON")
|
except urllib.error.HTTPError as exc:
|
||||||
|
if exc.code == 429 and attempt < self._max_retries:
|
||||||
|
wait = self._retry_base_delay * (2 ** attempt)
|
||||||
|
logger.warning(
|
||||||
|
"Rate limited (429), retrying in %.1fs (attempt %d/%d)",
|
||||||
|
wait, attempt + 1, self._max_retries,
|
||||||
|
)
|
||||||
|
time.sleep(wait)
|
||||||
|
continue
|
||||||
|
body = exc.read().decode("utf-8", errors="replace")
|
||||||
|
message = "Twitter API error %d: %s" % (exc.code, body[:500])
|
||||||
|
raise TwitterAPIError(exc.code, message)
|
||||||
|
except urllib.error.URLError as exc:
|
||||||
|
raise TwitterAPIError(0, "Twitter API network error: %s" % exc.reason)
|
||||||
|
|
||||||
if isinstance(parsed, dict) and parsed.get("errors"):
|
try:
|
||||||
message = parsed["errors"][0].get("message", "Unknown error")
|
parsed = json.loads(payload)
|
||||||
raise TwitterAPIError(0, "Twitter API returned errors: %s" % message)
|
except json.JSONDecodeError:
|
||||||
return parsed
|
raise TwitterAPIError(0, "Twitter API returned invalid JSON")
|
||||||
|
|
||||||
|
if isinstance(parsed, dict) and parsed.get("errors"):
|
||||||
|
err_msg = parsed["errors"][0].get("message", "Unknown error")
|
||||||
|
# Rate limit can also surface as a JSON error (code 88)
|
||||||
|
err_code = parsed["errors"][0].get("code", 0)
|
||||||
|
if err_code == 88 and attempt < self._max_retries:
|
||||||
|
wait = self._retry_base_delay * (2 ** attempt)
|
||||||
|
logger.warning(
|
||||||
|
"Rate limited (code 88), retrying in %.1fs (attempt %d/%d)",
|
||||||
|
wait, attempt + 1, self._max_retries,
|
||||||
|
)
|
||||||
|
time.sleep(wait)
|
||||||
|
continue
|
||||||
|
raise TwitterAPIError(0, "Twitter API returned errors: %s" % err_msg)
|
||||||
|
return parsed
|
||||||
|
|
||||||
|
# Should not be reached, but just in case
|
||||||
|
raise TwitterAPIError(429, "Rate limited after %d retries" % self._max_retries)
|
||||||
|
|
||||||
def _parse_timeline_response(self, data, get_instructions):
|
def _parse_timeline_response(self, data, get_instructions):
|
||||||
# type: (Any, Callable[[Any], Any]) -> Tuple[List[Tweet], Optional[str]]
|
# type: (Any, Callable[[Any], Any]) -> Tuple[List[Tweet], Optional[str]]
|
||||||
|
|||||||
@@ -29,6 +29,12 @@ DEFAULT_CONFIG = {
|
|||||||
"views_log": 0.5,
|
"views_log": 0.5,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
"rateLimit": {
|
||||||
|
"requestDelay": 1.5,
|
||||||
|
"maxRetries": 3,
|
||||||
|
"retryBaseDelay": 5.0,
|
||||||
|
"maxCount": 200,
|
||||||
|
},
|
||||||
} # type: Dict[str, Any]
|
} # type: Dict[str, Any]
|
||||||
|
|
||||||
|
|
||||||
@@ -128,6 +134,17 @@ def _normalize_config(config):
|
|||||||
filter_config["weights"] = normalized_weights
|
filter_config["weights"] = normalized_weights
|
||||||
merged["filter"] = filter_config
|
merged["filter"] = filter_config
|
||||||
|
|
||||||
|
# Normalize rateLimit section
|
||||||
|
rl = merged.get("rateLimit")
|
||||||
|
if not isinstance(rl, dict):
|
||||||
|
rl = {}
|
||||||
|
default_rl = DEFAULT_CONFIG["rateLimit"]
|
||||||
|
rl["requestDelay"] = max(_as_float(rl.get("requestDelay"), default_rl["requestDelay"]), 0.0)
|
||||||
|
rl["maxRetries"] = max(_as_int(rl.get("maxRetries"), default_rl["maxRetries"]), 0)
|
||||||
|
rl["retryBaseDelay"] = max(_as_float(rl.get("retryBaseDelay"), default_rl["retryBaseDelay"]), 1.0)
|
||||||
|
rl["maxCount"] = max(_as_int(rl.get("maxCount"), default_rl["maxCount"]), 1)
|
||||||
|
merged["rateLimit"] = rl
|
||||||
|
|
||||||
return merged
|
return merged
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user