diff --git a/tests/test_client.py b/tests/test_client.py
index b26098c..18005bd 100644
--- a/tests/test_client.py
+++ b/tests/test_client.py
@@ -24,8 +24,11 @@ from twitter_cli.graphql import (
 )
 from twitter_cli.parser import (
     _deep_get,
+    _extract_atomic_markdown,
     _extract_cursor,
     _extract_media,
+    _normalize_article_entity_map,
+    _parse_article,
     _parse_int,
     parse_tweet_result,
     parse_user_result,
@@ -414,6 +417,144 @@ class TestPaginationBehavior:
         assert [user.screen_name for user in users] == ["alice"]
 
 
+# ── Article parsing helpers ───────────────────────────────────────────────
+
+class TestNormalizeArticleEntityMap:
+    def test_accepts_dict_entity_map(self):
+        entity_map = {0: {"type": "MARKDOWN"}, "1": {"type": "LINK"}}
+
+        normalized = _normalize_article_entity_map(entity_map)
+
+        assert normalized == {"0": {"type": "MARKDOWN"}, "1": {"type": "LINK"}}
+
+    def test_accepts_list_entity_map(self):
+        entity_map = [
+            {"key": "4", "value": {"type": "MARKDOWN", "data": {"markdown": "```md\nhi\n```"}}},
+            {"key": 5, "value": {"type": "LINK", "data": {"url": "https://example.com"}}},
+        ]
+
+        normalized = _normalize_article_entity_map(entity_map)
+
+        assert normalized == {
+            "4": {"type": "MARKDOWN", "data": {"markdown": "```md\nhi\n```"}},
+            "5": {"type": "LINK", "data": {"url": "https://example.com"}},
+        }
+
+    def test_rejects_unknown_shapes(self):
+        assert _normalize_article_entity_map(None) == {}
+        assert _normalize_article_entity_map("bad") == {}
+
+
+class TestExtractAtomicMarkdown:
+    def test_extracts_markdown_entity(self):
+        block = {"entityRanges": [{"key": 4}]}
+        entity_map = {
+            "4": {"type": "MARKDOWN", "data": {"markdown": "```markdown\nconst answer = 42;\n```"}}
+        }
+
+        assert _extract_atomic_markdown(block, entity_map) == ["```markdown\nconst answer = 42;\n```"]
+
+    def test_ignores_non_markdown_entities(self):
+        block = {"entityRanges": [{"key": 0}, {"key": 1}]}
+        entity_map = {
+            "0": {"type": "MEDIA", "data": {"mediaItems": []}},
+            "1": {"type": "LINK", "data": {"url": "https://example.com"}},
+        }
+
+        assert _extract_atomic_markdown(block, entity_map) == []
+
+    def test_ignores_blank_markdown(self):
+        block = {"entityRanges": [{"key": 4}]}
+        entity_map = {"4": {"type": "MARKDOWN", "data": {"markdown": " \n"}}}
+
+        assert _extract_atomic_markdown(block, entity_map) == []
+
+
+class TestParseArticle:
+    def test_preserves_atomic_markdown_between_text_blocks(self):
+        result = {
+            "article": {
+                "article_results": {
+                    "result": {
+                        "title": "Article title",
+                        "content_state": {
+                            "blocks": [
+                                {"key": "a", "type": "unstyled", "text": "Intro", "entityRanges": []},
+                                {"key": "b", "type": "atomic", "text": " ", "entityRanges": [{"offset": 0, "length": 1, "key": 4}]},
+                                {"key": "c", "type": "unstyled", "text": "Outro", "entityRanges": []},
+                            ],
+                            "entityMap": [
+                                {
+                                    "key": "4",
+                                    "value": {
+                                        "type": "MARKDOWN",
+                                        "data": {"markdown": "```markdown\nconst answer = 42;\n```"},
+                                    },
+                                }
+                            ],
+                        },
+                    }
+                }
+            }
+        }
+
+        parsed = _parse_article(result)
+
+        assert parsed == {
+            "article_title": "Article title",
+            "article_text": "Intro\n\n```markdown\nconst answer = 42;\n```\n\nOutro",
+        }
+
+    def test_hooeem_like_payload_keeps_multiple_markdown_blocks(self):
+        result = {
+            "article": {
+                "article_results": {
+                    "result": {
+                        "title": "I want to become a Claude architect (full course).",
+                        "content_state": {
+                            "blocks": [
+                                {"key": "a", "type": "unstyled", "text": "If you have no idea how to get started go to Claude and paste this prompt which will help you with domain 1:", "entityRanges": []},
+                                {"key": "b", "type": "atomic", "text": " ", "entityRanges": [{"offset": 0, "length": 1, "key": 4}]},
+                                {"key": "c", "type": "unstyled", "text": "What to build to learn: A multi-tool agent with 3-4 MCP tools.", "entityRanges": []},
+                                {"key": "d", "type": "atomic", "text": " ", "entityRanges": [{"offset": 0, "length": 1, "key": 5}]},
+                                {"key": "e", "type": "unstyled", "text": "Done.", "entityRanges": []},
+                            ],
+                            "entityMap": [
+                                {
+                                    "key": "4",
+                                    "value": {
+                                        "type": "MARKDOWN",
+                                        "data": {"markdown": "```markdown\nYou are an expert instructor teaching Domain 1.\n```"},
+                                    },
+                                },
+                                {
+                                    "key": "5",
+                                    "value": {
+                                        "type": "MARKDOWN",
+                                        "data": {"markdown": "```markdown\nBest for: predictable, structured tasks like code reviews.\n```"},
+                                    },
+                                },
+                            ],
+                        },
+                    }
+                }
+            }
+        }
+
+        parsed = _parse_article(result)
+
+        assert parsed == {
+            "article_title": "I want to become a Claude architect (full course).",
+            "article_text": (
+                "If you have no idea how to get started go to Claude and paste this prompt which will help you with domain 1:\n\n"
+                "```markdown\nYou are an expert instructor teaching Domain 1.\n```\n\n"
+                "What to build to learn: A multi-tool agent with 3-4 MCP tools.\n\n"
+                "```markdown\nBest for: predictable, structured tasks like code reviews.\n```\n\n"
+                "Done."
+            ),
+        }
+
+
 # ── TwitterClient._parse_tweet_result ─────────────────────────────────────
 
 class TestParseTweetResult:
diff --git a/twitter_cli/parser.py b/twitter_cli/parser.py
index bb964a8..563c1b0 100644
--- a/twitter_cli/parser.py
+++ b/twitter_cli/parser.py
@@ -113,6 +113,55 @@ def _extract_author(user_data, user_legacy):
 
 # ── Article parsing ──────────────────────────────────────────────────────
 
+def _normalize_article_entity_map(entity_map):
+    # type: (Any) -> Dict[str, Any]
+    """Normalize a Draft.js entityMap into a dict keyed by string.
+
+    The map may arrive as a dict, whose keys are stringified, or as a list
+    of ``{"key": ..., "value": ...}`` items; non-dict items and items whose
+    key or value is None are skipped.  Any other shape yields ``{}``.
+    """
+    if isinstance(entity_map, dict):
+        return {str(key): value for key, value in entity_map.items()}
+    if isinstance(entity_map, list):
+        normalized = {}  # type: Dict[str, Any]
+        for item in entity_map:
+            if not isinstance(item, dict):
+                continue
+            key = item.get("key")
+            value = item.get("value")
+            if key is None or value is None:
+                continue
+            normalized[str(key)] = value
+        return normalized
+    return {}
+
+
+def _extract_atomic_markdown(block, entity_map):
+    # type: (Dict[str, Any], Dict[str, Any]) -> List[str]
+    """Extract embedded markdown/code payloads from atomic Draft.js entities.
+
+    Each ``entityRanges`` key of *block* is looked up (as a string) in
+    *entity_map*; for entities whose type is MARKDOWN (case-insensitive) the
+    stripped ``data.markdown`` text is returned.  Blank payloads and other
+    entity types are ignored.
+    """
+    parts = []  # type: List[str]
+    for entity_range in block.get("entityRanges", []) or []:
+        if not isinstance(entity_range, dict):
+            continue
+        entity_key = entity_range.get("key")
+        entity = entity_map.get(str(entity_key)) if entity_key is not None else None
+        if not isinstance(entity, dict):
+            continue
+        if str(entity.get("type") or "").upper() != "MARKDOWN":
+            continue
+        markdown = _deep_get(entity, "data", "markdown")
+        if isinstance(markdown, str) and markdown.strip():
+            parts.append(markdown.strip())
+    return parts
+
+
 def _parse_article(tweet_data):
     # type: (Dict[str, Any]) -> Dict[str, Any]
     """Extract Twitter Article data (long-form content) from a tweet.
@@ -130,12 +179,16 @@ def _parse_article(tweet_data):
     if not blocks:
         return {"article_title": title, "article_text": None}
 
+    entity_map = _normalize_article_entity_map(content_state.get("entityMap", {}))
+
     # Convert draft.js blocks to Markdown
     parts = []  # type: List[str]
     ordered_counter = 0
     for block in blocks:
         block_type = block.get("type", "unstyled")  # type: str
         if block_type == "atomic":
+            parts.extend(_extract_atomic_markdown(block, entity_map))
+            ordered_counter = 0
             continue
         text = block.get("text", "")  # type: str
         if not text: