feat: preserve article atomic markdown blocks (#37)
* fix: preserve atomic markdown blocks in articles * test: add parser unit coverage for article markdown blocks
This commit is contained in:
@@ -24,8 +24,11 @@ from twitter_cli.graphql import (
|
|||||||
)
|
)
|
||||||
from twitter_cli.parser import (
|
from twitter_cli.parser import (
|
||||||
_deep_get,
|
_deep_get,
|
||||||
|
_extract_atomic_markdown,
|
||||||
_extract_cursor,
|
_extract_cursor,
|
||||||
_extract_media,
|
_extract_media,
|
||||||
|
_normalize_article_entity_map,
|
||||||
|
_parse_article,
|
||||||
_parse_int,
|
_parse_int,
|
||||||
parse_tweet_result,
|
parse_tweet_result,
|
||||||
parse_user_result,
|
parse_user_result,
|
||||||
@@ -414,6 +417,144 @@ class TestPaginationBehavior:
|
|||||||
assert [user.screen_name for user in users] == ["alice"]
|
assert [user.screen_name for user in users] == ["alice"]
|
||||||
|
|
||||||
|
|
||||||
|
# ── Article parsing helpers ───────────────────────────────────────────────
|
||||||
|
|
||||||
|
class TestNormalizeArticleEntityMap:
    """Unit coverage for ``_normalize_article_entity_map``."""

    def test_accepts_dict_entity_map(self):
        # A dict-shaped map mixing an int key and a str key must come back
        # keyed by strings only.
        raw = {0: {"type": "MARKDOWN"}, "1": {"type": "LINK"}}
        expected = {"0": {"type": "MARKDOWN"}, "1": {"type": "LINK"}}

        assert _normalize_article_entity_map(raw) == expected

    def test_accepts_list_entity_map(self):
        # The list shape is a sequence of {"key": ..., "value": ...} records.
        first_record = {
            "key": "4",
            "value": {"type": "MARKDOWN", "data": {"markdown": "```md\nhi\n```"}},
        }
        second_record = {
            "key": 5,
            "value": {"type": "LINK", "data": {"url": "https://example.com"}},
        }
        expected = {
            "4": {"type": "MARKDOWN", "data": {"markdown": "```md\nhi\n```"}},
            "5": {"type": "LINK", "data": {"url": "https://example.com"}},
        }

        assert _normalize_article_entity_map([first_record, second_record]) == expected

    def test_rejects_unknown_shapes(self):
        # Anything that is neither a dict nor a list collapses to an empty map.
        for unsupported in (None, "bad"):
            assert _normalize_article_entity_map(unsupported) == {}
|
||||||
|
|
||||||
|
|
||||||
|
class TestExtractAtomicMarkdown:
    """Unit coverage for ``_extract_atomic_markdown``."""

    def test_extracts_markdown_entity(self):
        # The entity range uses an int key while the map is keyed by str.
        fenced = "```markdown\nconst answer = 42;\n```"
        atomic_block = {"entityRanges": [{"key": 4}]}
        entities = {"4": {"type": "MARKDOWN", "data": {"markdown": fenced}}}

        extracted = _extract_atomic_markdown(atomic_block, entities)

        assert extracted == ["```markdown\nconst answer = 42;\n```"]

    def test_ignores_non_markdown_entities(self):
        atomic_block = {"entityRanges": [{"key": 0}, {"key": 1}]}
        entities = {
            "0": {"type": "MEDIA", "data": {"mediaItems": []}},
            "1": {"type": "LINK", "data": {"url": "https://example.com"}},
        }

        extracted = _extract_atomic_markdown(atomic_block, entities)

        assert extracted == []

    def test_ignores_blank_markdown(self):
        # A whitespace-only markdown payload must not produce an entry.
        atomic_block = {"entityRanges": [{"key": 4}]}
        entities = {"4": {"type": "MARKDOWN", "data": {"markdown": " \n"}}}

        extracted = _extract_atomic_markdown(atomic_block, entities)

        assert extracted == []
|
||||||
|
|
||||||
|
|
||||||
|
class TestParseArticle:
    """Checks that ``_parse_article`` keeps atomic markdown blocks in document order."""

    @staticmethod
    def _text_block(key, text):
        # Plain paragraph block with no entity references.
        return {"key": key, "type": "unstyled", "text": text, "entityRanges": []}

    @staticmethod
    def _atomic_block(key, entity_key):
        # Atomic placeholder block pointing at one entity in the entity map.
        return {
            "key": key,
            "type": "atomic",
            "text": " ",
            "entityRanges": [{"offset": 0, "length": 1, "key": entity_key}],
        }

    @staticmethod
    def _markdown_entry(key, markdown):
        # One list-shaped entityMap record carrying a MARKDOWN entity.
        return {
            "key": key,
            "value": {
                "type": "MARKDOWN",
                "data": {"markdown": markdown},
            },
        }

    @staticmethod
    def _article_payload(title, blocks, entity_map):
        # Wrap the content state in the nesting these tests feed to _parse_article.
        return {
            "article": {
                "article_results": {
                    "result": {
                        "title": title,
                        "content_state": {
                            "blocks": blocks,
                            "entityMap": entity_map,
                        },
                    }
                }
            }
        }

    def test_preserves_atomic_markdown_between_text_blocks(self):
        payload = self._article_payload(
            "Article title",
            [
                self._text_block("a", "Intro"),
                self._atomic_block("b", 4),
                self._text_block("c", "Outro"),
            ],
            [
                self._markdown_entry("4", "```markdown\nconst answer = 42;\n```"),
            ],
        )

        outcome = _parse_article(payload)

        assert outcome == {
            "article_title": "Article title",
            "article_text": "Intro\n\n```markdown\nconst answer = 42;\n```\n\nOutro",
        }

    def test_hooeem_like_payload_keeps_multiple_markdown_blocks(self):
        payload = self._article_payload(
            "I want to become a Claude architect (full course).",
            [
                self._text_block(
                    "a",
                    "If you have no idea how to get started go to Claude and paste this prompt which will help you with domain 1:",
                ),
                self._atomic_block("b", 4),
                self._text_block(
                    "c",
                    "What to build to learn: A multi-tool agent with 3-4 MCP tools.",
                ),
                self._atomic_block("d", 5),
                self._text_block("e", "Done."),
            ],
            [
                self._markdown_entry(
                    "4",
                    "```markdown\nYou are an expert instructor teaching Domain 1.\n```",
                ),
                self._markdown_entry(
                    "5",
                    "```markdown\nBest for: predictable, structured tasks like code reviews.\n```",
                ),
            ],
        )

        outcome = _parse_article(payload)

        expected_text = "\n\n".join(
            [
                "If you have no idea how to get started go to Claude and paste this prompt which will help you with domain 1:",
                "```markdown\nYou are an expert instructor teaching Domain 1.\n```",
                "What to build to learn: A multi-tool agent with 3-4 MCP tools.",
                "```markdown\nBest for: predictable, structured tasks like code reviews.\n```",
                "Done.",
            ]
        )
        assert outcome == {
            "article_title": "I want to become a Claude architect (full course).",
            "article_text": expected_text,
        }
|
||||||
|
|
||||||
|
|
||||||
# ── TwitterClient._parse_tweet_result ─────────────────────────────────────
|
# ── TwitterClient._parse_tweet_result ─────────────────────────────────────
|
||||||
|
|
||||||
class TestParseTweetResult:
|
class TestParseTweetResult:
|
||||||
@@ -514,6 +655,7 @@ class TestParseTweetResult:
|
|||||||
assert parse_tweet_result(self.SAMPLE_TWEET_RESULT, depth=3) is None
|
assert parse_tweet_result(self.SAMPLE_TWEET_RESULT, depth=3) is None
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# ── TwitterAPIError ──────────────────────────────────────────────────────
|
# ── TwitterAPIError ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
class TestTwitterAPIError:
|
class TestTwitterAPIError:
|
||||||
|
|||||||
@@ -113,6 +113,46 @@ def _extract_author(user_data, user_legacy):
|
|||||||
# ── Article parsing ──────────────────────────────────────────────────────
|
# ── Article parsing ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_article_entity_map(entity_map):
|
||||||
|
# type: (Any) -> Dict[str, Any]
|
||||||
|
"""Normalize Draft.js entityMap that may arrive as dict or [{key, value}, ...]."""
|
||||||
|
if isinstance(entity_map, dict):
|
||||||
|
return {str(key): value for key, value in entity_map.items()}
|
||||||
|
if isinstance(entity_map, list):
|
||||||
|
normalized = {} # type: Dict[str, Any]
|
||||||
|
for item in entity_map:
|
||||||
|
if not isinstance(item, dict):
|
||||||
|
continue
|
||||||
|
key = item.get("key")
|
||||||
|
value = item.get("value")
|
||||||
|
if key is None or value is None:
|
||||||
|
continue
|
||||||
|
normalized[str(key)] = value
|
||||||
|
return normalized
|
||||||
|
return {}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_atomic_markdown(block, entity_map):
    # type: (Dict[str, Any], Dict[str, Any]) -> List[str]
    """Collect the markdown payloads referenced by a block's entity ranges.

    Each entry of ``block["entityRanges"]`` is resolved against ``entity_map``
    using the string form of its ``key``. Only dict entities whose ``type``
    equals ``MARKDOWN`` (compared case-insensitively) contribute. The payload
    is fetched with ``_deep_get(entity, "data", "markdown")`` and appended,
    whitespace-stripped, when it is a non-blank string.

    Returns the payloads in entity-range order; an empty list when none match.
    """
    collected = []  # type: List[str]
    ranges = block.get("entityRanges", []) or []
    for span in ranges:
        if not isinstance(span, dict):
            continue
        reference = span.get("key")
        if reference is None:
            continue
        # Range keys may be ints while the map is keyed by strings.
        candidate = entity_map.get(str(reference))
        if not isinstance(candidate, dict):
            continue
        kind = str(candidate.get("type") or "").upper()
        if kind != "MARKDOWN":
            continue
        payload = _deep_get(candidate, "data", "markdown")
        if not isinstance(payload, str):
            continue
        trimmed = payload.strip()
        if trimmed:
            collected.append(trimmed)
    return collected
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def _parse_article(tweet_data):
|
def _parse_article(tweet_data):
|
||||||
# type: (Dict[str, Any]) -> Dict[str, Any]
|
# type: (Dict[str, Any]) -> Dict[str, Any]
|
||||||
"""Extract Twitter Article data (long-form content) from a tweet.
|
"""Extract Twitter Article data (long-form content) from a tweet.
|
||||||
@@ -130,12 +170,16 @@ def _parse_article(tweet_data):
|
|||||||
if not blocks:
|
if not blocks:
|
||||||
return {"article_title": title, "article_text": None}
|
return {"article_title": title, "article_text": None}
|
||||||
|
|
||||||
|
entity_map = _normalize_article_entity_map(content_state.get("entityMap", {}))
|
||||||
|
|
||||||
# Convert draft.js blocks to Markdown
|
# Convert draft.js blocks to Markdown
|
||||||
parts = [] # type: List[str]
|
parts = [] # type: List[str]
|
||||||
ordered_counter = 0
|
ordered_counter = 0
|
||||||
for block in blocks:
|
for block in blocks:
|
||||||
block_type = block.get("type", "unstyled") # type: str
|
block_type = block.get("type", "unstyled") # type: str
|
||||||
if block_type == "atomic":
|
if block_type == "atomic":
|
||||||
|
parts.extend(_extract_atomic_markdown(block, entity_map))
|
||||||
|
ordered_counter = 0
|
||||||
continue
|
continue
|
||||||
text = block.get("text", "") # type: str
|
text = block.get("text", "") # type: str
|
||||||
if not text:
|
if not text:
|
||||||
|
|||||||
Reference in New Issue
Block a user