"""Twitter GraphQL API client. Uses the same internal GraphQL endpoint that the Twitter web app uses, authenticated via cookies (auth_token + ct0). QueryId is resolved dynamically using a three-tier strategy. """ from __future__ import annotations import json import logging import math import re import ssl import urllib.request from typing import Any, Callable, Dict, List, Optional, Tuple from .models import Author, Metrics, Tweet, TweetMedia logger = logging.getLogger(__name__) # Public bearer token shared by all Twitter web clients BEARER_TOKEN = ( "AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs" "%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA" ) # Last-resort fallback query IDs FALLBACK_QUERY_IDS = { "HomeTimeline": "HJFjzBgCs16TqxewQOeLNg", "Bookmarks": "VFdMm9iVZxlU6hD86gfW_A", } # Community-maintained API definition (auto-updated daily) TWITTER_OPENAPI_URL = ( "https://raw.githubusercontent.com/fa0311/twitter-openapi/" "main/src/config/placeholder.json" ) # Default features flags required by the GraphQL endpoint FEATURES = { "rweb_tipjar_consumption_enabled": True, "responsive_web_graphql_exclude_directive_enabled": True, "verified_phone_label_enabled": False, "creator_subscriptions_tweet_preview_api_enabled": True, "responsive_web_graphql_timeline_navigation_enabled": True, "responsive_web_graphql_skip_user_profile_image_extensions_enabled": False, "communities_web_enable_tweet_community_results_fetch": True, "c9s_tweet_anatomy_moderator_badge_enabled": True, "articles_preview_enabled": True, "responsive_web_edit_tweet_api_enabled": True, "graphql_is_translatable_rweb_tweet_is_translatable_enabled": True, "view_counts_everywhere_api_enabled": True, "longform_notetweets_consumption_enabled": True, "responsive_web_twitter_article_tweet_consumption_enabled": True, "tweet_awards_web_tipping_enabled": False, "creator_subscriptions_quote_tweet_preview_enabled": False, "freedom_of_speech_not_reach_fetch_enabled": True, "standardized_nudges_misinfo": True, "tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled": True, "rweb_video_timestamps_enabled": True, "longform_notetweets_rich_text_read_enabled": True, "longform_notetweets_inline_media_enabled": True, "responsive_web_enhance_cards_enabled": False, } USER_AGENT = ( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " "AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/131.0.0.0 Safari/537.36" ) # Module-level cache for query IDs _cached_query_ids = {} # type: Dict[str, str] _bundles_scanned = False def _create_ssl_context(): # type: () -> ssl.SSLContext """Create a permissive SSL context for urllib.""" ctx = ssl.create_default_context() return ctx def _url_fetch(url, headers=None): # type: (str, Optional[Dict[str, str]]) -> str """Simple URL fetch using urllib.""" req = urllib.request.Request(url) if headers: for k, v in headers.items(): req.add_header(k, v) ctx = _create_ssl_context() with urllib.request.urlopen(req, context=ctx, timeout=30) as resp: return resp.read().decode("utf-8") def _scan_bundles(): # type: () -> None """Tier 1: Scan Twitter's main-page JS bundles to extract queryId/operationName pairs.""" global _bundles_scanned if _bundles_scanned: return _bundles_scanned = True try: html = _url_fetch("https://x.com", {"user-agent": USER_AGENT}) script_pattern = re.compile( r'(?:src|href)=["\']' r'(https://abs\.twimg\.com/responsive-web/client-web[^"\']+\.js)' r'["\']' ) script_urls = script_pattern.findall(html) for url in script_urls: try: js = _url_fetch(url) op_pattern = re.compile( r'queryId:\s*"([A-Za-z0-9_-]+)"[^}]{0,200}' r'operationName:\s*"([^"]+)"' ) for m in op_pattern.finditer(js): qid, name = m.group(1), m.group(2) if name not in _cached_query_ids: _cached_query_ids[name] = qid except Exception: continue count = len(_cached_query_ids) logger.info("Scanned %d JS bundles, found %d operations", len(script_urls), count) except Exception as e: logger.warning("Failed to scan JS bundles: %s", e) def _fetch_from_github(operation_name): # type: (str) -> Optional[str] """Tier 2: Fetch queryId from community-maintained twitter-openapi.""" try: logger.info("Fetching latest queryId from GitHub (twitter-openapi)...") data_str = _url_fetch(TWITTER_OPENAPI_URL) data = json.loads(data_str) op = data.get(operation_name, {}) qid = op.get("queryId") if qid: logger.info("Found %s queryId from GitHub: %s", operation_name, qid) return qid return None except Exception as e: logger.warning("GitHub lookup failed: %s", e) return None def _resolve_query_id(operation_name): # type: (str) -> str """Resolve queryId using three-tier strategy: bundle scan -> GitHub -> fallback.""" if operation_name in _cached_query_ids: return _cached_query_ids[operation_name] logger.info("Auto-detecting %s queryId...", operation_name) # Tier 1: JS bundle scan _scan_bundles() if operation_name in _cached_query_ids: logger.info("Found %s queryId: %s", operation_name, _cached_query_ids[operation_name]) return _cached_query_ids[operation_name] # Tier 2: GitHub github_id = _fetch_from_github(operation_name) if github_id: _cached_query_ids[operation_name] = github_id return github_id # Tier 3: Hardcoded fallback fallback = FALLBACK_QUERY_IDS.get(operation_name) if fallback: logger.info("Using hardcoded fallback queryId for %s: %s", operation_name, fallback) _cached_query_ids[operation_name] = fallback return fallback raise RuntimeError( 'Cannot resolve queryId for "%s" — all detection methods failed' % operation_name ) class TwitterClient: """Twitter GraphQL API client using cookie authentication.""" def __init__(self, auth_token, ct0): # type: (str, str) -> None self._auth_token = auth_token self._ct0 = ct0 def fetch_home_timeline(self, count=20): # type: (int) -> List[Tweet] """Fetch home timeline tweets.""" query_id = _resolve_query_id("HomeTimeline") return self._fetch_timeline( query_id, "HomeTimeline", count, lambda data: _deep_get(data, "data", "home", "home_timeline_urt", "instructions"), ) def fetch_bookmarks(self, count=50): # type: (int) -> List[Tweet] """Fetch bookmarked tweets.""" query_id = _resolve_query_id("Bookmarks") def get_instructions(data): # type: (Any) -> Any result = _deep_get(data, "data", "bookmark_timeline", "timeline", "instructions") if result is None: result = _deep_get(data, "data", "bookmark_timeline_v2", "timeline", "instructions") return result return self._fetch_timeline(query_id, "Bookmarks", count, get_instructions) def _fetch_timeline(self, query_id, operation_name, count, get_instructions, extra_variables=None): # type: (str, str, int, Callable, Optional[Dict[str, Any]]) -> List[Tweet] """Generic timeline fetcher with pagination and deduplication.""" tweets = [] # type: List[Tweet] cursor = None # type: Optional[str] attempts = 0 max_attempts = int(math.ceil(count / 20.0)) + 2 while len(tweets) < count and attempts < max_attempts: attempts += 1 variables = { "count": min(count - len(tweets) + 5, 40), "includePromotedContent": False, "latestControlAvailable": True, "requestContext": "launch", } # type: Dict[str, Any] if extra_variables: variables.update(extra_variables) if cursor: variables["cursor"] = cursor url = "https://x.com/i/api/graphql/%s/%s?" % (query_id, operation_name) url += "variables=%s&features=%s" % ( urllib.request.quote(json.dumps(variables)), urllib.request.quote(json.dumps(FEATURES)), ) data = self._api_get(url) new_tweets, next_cursor = self._parse_timeline_response(data, get_instructions) seen_ids = {t.id for t in tweets} for tweet in new_tweets: if tweet.id not in seen_ids: tweets.append(tweet) seen_ids.add(tweet.id) if not next_cursor or not new_tweets: break cursor = next_cursor return tweets[:count] def _build_headers(self): # type: () -> Dict[str, str] return { "Authorization": "Bearer %s" % BEARER_TOKEN, "Cookie": "auth_token=%s; ct0=%s" % (self._auth_token, self._ct0), "X-Csrf-Token": self._ct0, "X-Twitter-Active-User": "yes", "X-Twitter-Auth-Type": "OAuth2Session", "X-Twitter-Client-Language": "en", "Content-Type": "application/json", "User-Agent": USER_AGENT, "Referer": "https://x.com/home", "Accept": "*/*", "Accept-Language": "en-US,en;q=0.9", } def _api_get(self, url): # type: (str) -> Any """Make authenticated GET request to Twitter API.""" headers = self._build_headers() req = urllib.request.Request(url) for k, v in headers.items(): req.add_header(k, v) ctx = _create_ssl_context() try: with urllib.request.urlopen(req, context=ctx, timeout=30) as resp: body = resp.read().decode("utf-8") return json.loads(body) except urllib.error.HTTPError as e: body = e.read().decode("utf-8", errors="replace") raise RuntimeError("Twitter API error %d: %s" % (e.code, body[:500])) def _parse_timeline_response(self, data, get_instructions): # type: (Any, Callable) -> Tuple[List[Tweet], Optional[str]] """Parse timeline GraphQL response into tweets + next cursor.""" tweets = [] # type: List[Tweet] next_cursor = None # type: Optional[str] try: instructions = get_instructions(data) if not isinstance(instructions, list): logger.warning("No instructions found in response") return tweets, next_cursor for instruction in instructions: entries = instruction.get("entries") or instruction.get("moduleItems") or [] for entry in entries: content = entry.get("content", {}) # Handle cursor entries if content.get("cursorType") == "Bottom" or content.get("entryType") == "TimelineTimelineCursor": val = content.get("value") if val: next_cursor = val continue # Handle single tweet entries item_content = content.get("itemContent", {}) tweet_results = item_content.get("tweet_results", {}) result = tweet_results.get("result") if result: tweet = self._parse_tweet_result(result) if tweet: tweets.append(tweet) # Handle conversation module (tweet threads) items = content.get("items", []) for item in items: nested = ( item.get("item", {}) .get("itemContent", {}) .get("tweet_results", {}) .get("result") ) if nested: tweet = self._parse_tweet_result(nested) if tweet: tweets.append(tweet) except Exception as e: logger.warning("Error parsing timeline response: %s", e) return tweets, next_cursor def _parse_tweet_result(self, result): # type: (Dict[str, Any]) -> Optional[Tweet] """Parse a single TweetResult from GraphQL response.""" try: tweet_data = result # Handle TweetWithVisibilityResults wrapper if result.get("__typename") == "TweetWithVisibilityResults" and result.get("tweet"): tweet_data = result["tweet"] if tweet_data.get("__typename") == "TweetTombstone": return None if not tweet_data.get("legacy") or not tweet_data.get("core"): return None legacy = tweet_data["legacy"] user = tweet_data["core"]["user_results"]["result"] user_legacy = user.get("legacy", {}) user_core = user.get("core", {}) # Check if this is a retweet is_retweet = bool(legacy.get("retweeted_status_result", {}).get("result")) actual_data = tweet_data actual_legacy = legacy actual_user = user actual_user_legacy = user_legacy if is_retweet: rt_result = legacy["retweeted_status_result"]["result"] # Handle wrapped retweet if rt_result.get("__typename") == "TweetWithVisibilityResults" and rt_result.get("tweet"): rt_result = rt_result["tweet"] if rt_result.get("legacy") and rt_result.get("core"): actual_data = rt_result actual_legacy = rt_result["legacy"] actual_user = rt_result["core"]["user_results"]["result"] actual_user_legacy = actual_user.get("legacy", {}) # Parse media media = [] # type: List[TweetMedia] ext_media = actual_legacy.get("extended_entities", {}).get("media", []) for m in ext_media: m_type = m.get("type", "") if m_type == "photo": media.append(TweetMedia( type="photo", url=m.get("media_url_https", ""), width=_deep_get(m, "original_info", "width"), height=_deep_get(m, "original_info", "height"), )) elif m_type in ("video", "animated_gif"): variants = m.get("video_info", {}).get("variants", []) mp4_variants = [v for v in variants if v.get("content_type") == "video/mp4"] mp4_variants.sort(key=lambda v: v.get("bitrate", 0), reverse=True) video_url = mp4_variants[0]["url"] if mp4_variants else m.get("media_url_https", "") media.append(TweetMedia( type=m_type, url=video_url, width=_deep_get(m, "original_info", "width"), height=_deep_get(m, "original_info", "height"), )) # Parse URLs urls = [u.get("expanded_url", "") for u in actual_legacy.get("entities", {}).get("urls", [])] # Parse quoted tweet quoted_tweet = None # type: Optional[Tweet] quoted_result = actual_data.get("quoted_status_result", {}).get("result") if quoted_result: quoted_tweet = self._parse_tweet_result(quoted_result) # Extract user info — try user.core (new API), then user.legacy (old API) au = actual_user aul = actual_user_legacy auc = au.get("core", {}) user_name = auc.get("name") or aul.get("name") or au.get("name", "Unknown") user_screen_name = auc.get("screen_name") or aul.get("screen_name") or au.get("screen_name", "unknown") user_profile_image = au.get("avatar", {}).get("image_url") or aul.get("profile_image_url_https", "") user_verified = au.get("is_blue_verified") or aul.get("verified", False) # Retweeted by info rt_screen_name = None # type: Optional[str] if is_retweet: rt_screen_name = user_core.get("screen_name") or user_legacy.get("screen_name", "unknown") return Tweet( id=actual_data.get("rest_id", ""), text=actual_legacy.get("full_text", ""), author=Author( id=au.get("rest_id", ""), name=user_name, screen_name=user_screen_name, profile_image_url=user_profile_image, verified=bool(user_verified), ), metrics=Metrics( likes=actual_legacy.get("favorite_count", 0), retweets=actual_legacy.get("retweet_count", 0), replies=actual_legacy.get("reply_count", 0), quotes=actual_legacy.get("quote_count", 0), views=int(actual_data.get("views", {}).get("count", "0") or "0"), bookmarks=actual_legacy.get("bookmark_count", 0), ), created_at=actual_legacy.get("created_at", ""), media=media, urls=urls, is_retweet=is_retweet, retweeted_by=rt_screen_name, quoted_tweet=quoted_tweet, lang=actual_legacy.get("lang", ""), ) except Exception as e: logger.warning("Failed to parse tweet: %s", e) return None def _deep_get(d, *keys): # type: (Any, *str) -> Any """Safely get a nested value from a dict.""" for key in keys: if isinstance(d, dict): d = d.get(key) else: return None return d