feat: preserve article atomic markdown blocks (#37)

* fix: preserve atomic markdown blocks in articles
* test: add parser unit coverage for article markdown blocks
2026-03-17 18:06:02 +08:00
parent 8cb5824ed4
commit 90f0635c50
2 changed files with 186 additions and 0 deletions
--- a/tests/test_client.py
+++ b/tests/test_client.py
@@ -24,8 +24,11 @@ from twitter_cli.graphql import (
 )
 from twitter_cli.parser import (
    _deep_get,
+    _extract_atomic_markdown,
    _extract_cursor,
    _extract_media,
+    _normalize_article_entity_map,
+    _parse_article,
    _parse_int,
    parse_tweet_result,
    parse_user_result,
@@ -414,6 +417,144 @@ class TestPaginationBehavior:
        assert [user.screen_name for user in users] == ["alice"]


+# ── Article parsing helpers ───────────────────────────────────────────────
+
+class TestNormalizeArticleEntityMap:  # unit tests for _normalize_article_entity_map
+    def test_accepts_dict_entity_map(self):  # dict input: keys are stringified, values kept
+        entity_map = {0: {"type": "MARKDOWN"}, "1": {"type": "LINK"}}  # mixed int/str keys
+
+        normalized = _normalize_article_entity_map(entity_map)
+
+        assert normalized == {"0": {"type": "MARKDOWN"}, "1": {"type": "LINK"}}  # all keys are str
+
+    def test_accepts_list_entity_map(self):  # list input: [{"key": ..., "value": ...}, ...]
+        entity_map = [
+            {"key": "4", "value": {"type": "MARKDOWN", "data": {"markdown": "```md\nhi\n```"}}},
+            {"key": 5, "value": {"type": "LINK", "data": {"url": "https://example.com"}}},
+        ]
+
+        normalized = _normalize_article_entity_map(entity_map)
+
+        assert normalized == {  # flattened to key -> value, with the int key 5 turned into "5"
+            "4": {"type": "MARKDOWN", "data": {"markdown": "```md\nhi\n```"}},
+            "5": {"type": "LINK", "data": {"url": "https://example.com"}},
+        }
+
+    def test_rejects_unknown_shapes(self):  # neither dict nor list -> empty dict, no exception
+        assert _normalize_article_entity_map(None) == {}
+        assert _normalize_article_entity_map("bad") == {}
+
+
+class TestExtractAtomicMarkdown:  # unit tests for _extract_atomic_markdown
+    def test_extracts_markdown_entity(self):  # a MARKDOWN entity referenced by the block is returned
+        block = {"entityRanges": [{"key": 4}]}  # range key is an int, map key below is a str
+        entity_map = {
+            "4": {"type": "MARKDOWN", "data": {"markdown": "```markdown\nconst answer = 42;\n```"}}
+        }
+
+        assert _extract_atomic_markdown(block, entity_map) == ["```markdown\nconst answer = 42;\n```"]
+
+    def test_ignores_non_markdown_entities(self):  # MEDIA and LINK entities contribute nothing
+        block = {"entityRanges": [{"key": 0}, {"key": 1}]}
+        entity_map = {
+            "0": {"type": "MEDIA", "data": {"mediaItems": []}},
+            "1": {"type": "LINK", "data": {"url": "https://example.com"}},
+        }
+
+        assert _extract_atomic_markdown(block, entity_map) == []
+
+    def test_ignores_blank_markdown(self):  # whitespace-only payloads are dropped
+        block = {"entityRanges": [{"key": 4}]}
+        entity_map = {"4": {"type": "MARKDOWN", "data": {"markdown": "   \n"}}}
+
+        assert _extract_atomic_markdown(block, entity_map) == []
+
+
+class TestParseArticle:  # _parse_article tests covering atomic MARKDOWN blocks between text blocks
+    def test_preserves_atomic_markdown_between_text_blocks(self):  # one atomic block between two paragraphs
+        result = {
+            "article": {
+                "article_results": {
+                    "result": {
+                        "title": "Article title",
+                        "content_state": {
+                            "blocks": [
+                                {"key": "a", "type": "unstyled", "text": "Intro", "entityRanges": []},
+                                {"key": "b", "type": "atomic", "text": " ", "entityRanges": [{"offset": 0, "length": 1, "key": 4}]},
+                                {"key": "c", "type": "unstyled", "text": "Outro", "entityRanges": []},
+                            ],
+                            "entityMap": [  # list-of-{key, value} form of the entity map
+                                {
+                                    "key": "4",
+                                    "value": {
+                                        "type": "MARKDOWN",
+                                        "data": {"markdown": "```markdown\nconst answer = 42;\n```"},
+                                    },
+                                }
+                            ],
+                        },
+                    }
+                }
+            }
+        }
+
+        parsed = _parse_article(result)
+
+        assert parsed == {
+            "article_title": "Article title",
+            "article_text": "Intro\n\n```markdown\nconst answer = 42;\n```\n\nOutro",  # parts separated by blank lines
+        }
+
+    def test_hooeem_like_payload_keeps_multiple_markdown_blocks(self):  # two atomic blocks interleaved with text
+        result = {
+            "article": {
+                "article_results": {
+                    "result": {
+                        "title": "I want to become a Claude architect (full course).",
+                        "content_state": {
+                            "blocks": [
+                                {"key": "a", "type": "unstyled", "text": "If you have no idea how to get started go to Claude and paste this prompt which will help you with domain 1:", "entityRanges": []},
+                                {"key": "b", "type": "atomic", "text": " ", "entityRanges": [{"offset": 0, "length": 1, "key": 4}]},
+                                {"key": "c", "type": "unstyled", "text": "What to build to learn: A multi-tool agent with 3-4 MCP tools.", "entityRanges": []},
+                                {"key": "d", "type": "atomic", "text": " ", "entityRanges": [{"offset": 0, "length": 1, "key": 5}]},
+                                {"key": "e", "type": "unstyled", "text": "Done.", "entityRanges": []},
+                            ],
+                            "entityMap": [  # entities 4 and 5 are referenced by atomic blocks "b" and "d"
+                                {
+                                    "key": "4",
+                                    "value": {
+                                        "type": "MARKDOWN",
+                                        "data": {"markdown": "```markdown\nYou are an expert instructor teaching Domain 1.\n```"},
+                                    },
+                                },
+                                {
+                                    "key": "5",
+                                    "value": {
+                                        "type": "MARKDOWN",
+                                        "data": {"markdown": "```markdown\nBest for: predictable, structured tasks like code reviews.\n```"},
+                                    },
+                                },
+                            ],
+                        },
+                    }
+                }
+            }
+        }
+
+        parsed = _parse_article(result)
+
+        assert parsed == {
+            "article_title": "I want to become a Claude architect (full course).",
+            "article_text": (  # text and markdown parts appear in block order
+                "If you have no idea how to get started go to Claude and paste this prompt which will help you with domain 1:\n\n"
+                "```markdown\nYou are an expert instructor teaching Domain 1.\n```\n\n"
+                "What to build to learn: A multi-tool agent with 3-4 MCP tools.\n\n"
+                "```markdown\nBest for: predictable, structured tasks like code reviews.\n```\n\n"
+                "Done."
+            ),
+        }
+
+
 # ── TwitterClient._parse_tweet_result ─────────────────────────────────────

 class TestParseTweetResult:
@@ -514,6 +655,7 @@ class TestParseTweetResult:
        assert parse_tweet_result(self.SAMPLE_TWEET_RESULT, depth=3) is None


+
 # ── TwitterAPIError ──────────────────────────────────────────────────────

 class TestTwitterAPIError:
--- a/twitter_cli/parser.py
+++ b/twitter_cli/parser.py
@@ -113,6 +113,46 @@ def _extract_author(user_data, user_legacy):
 # ── Article parsing ──────────────────────────────────────────────────────


+def _normalize_article_entity_map(entity_map):
+    # type: (Any) -> Dict[str, Any]
+    """Normalize Draft.js entityMap that may arrive as dict or [{key, value}, ...]."""
+    if isinstance(entity_map, dict):
+        return {str(key): value for key, value in entity_map.items()}  # stringify keys (0 -> "0")
+    if isinstance(entity_map, list):
+        normalized = {}  # type: Dict[str, Any]
+        for item in entity_map:
+            if not isinstance(item, dict):
+                continue  # skip list items that are not {key, value} dicts
+            key = item.get("key")
+            value = item.get("value")
+            if key is None or value is None:
+                continue  # skip entries missing either a key or a value
+            normalized[str(key)] = value  # a repeated key overwrites the earlier entry
+        return normalized
+    return {}  # any other shape (None, str, ...) yields an empty map
+
+
+
+def _extract_atomic_markdown(block, entity_map):
+    # type: (Dict[str, Any], Dict[str, Any]) -> List[str]
+    """Extract embedded markdown/code payloads from atomic Draft.js entities."""
+    parts = []  # type: List[str]
+    for entity_range in block.get("entityRanges", []) or []:  # "or []" also covers a None/empty value
+        if not isinstance(entity_range, dict):
+            continue  # skip malformed range entries
+        entity_key = entity_range.get("key")
+        entity = entity_map.get(str(entity_key)) if entity_key is not None else None  # map is looked up by str key
+        if not isinstance(entity, dict):
+            continue  # range has no key, or the key has no usable entity
+        if str(entity.get("type") or "").upper() != "MARKDOWN":  # case-insensitive type match
+            continue  # other entity types are ignored
+        markdown = _deep_get(entity, "data", "markdown")  # presumably None when the path is missing - confirm in _deep_get
+        if isinstance(markdown, str) and markdown.strip():  # drop non-string and whitespace-only payloads
+            parts.append(markdown.strip())  # stored without surrounding whitespace
+    return parts  # payloads in entityRanges order; empty list when nothing matched
+
+
+
 def _parse_article(tweet_data):
    # type: (Dict[str, Any]) -> Dict[str, Any]
    """Extract Twitter Article data (long-form content) from a tweet.
@@ -130,12 +170,16 @@ def _parse_article(tweet_data):
    if not blocks:
        return {"article_title": title, "article_text": None}

+    entity_map = _normalize_article_entity_map(content_state.get("entityMap", {}))
+
    # Convert draft.js blocks to Markdown
    parts = []  # type: List[str]
    ordered_counter = 0
    for block in blocks:
        block_type = block.get("type", "unstyled")  # type: str
        if block_type == "atomic":
+            parts.extend(_extract_atomic_markdown(block, entity_map))
+            ordered_counter = 0
            continue
        text = block.get("text", "")  # type: str
        if not text: