diff --git a/tests/test_client.py b/tests/test_client.py
index cb92a14..7bb1d5d 100644
--- a/tests/test_client.py
+++ b/tests/test_client.py
@@ -31,6 +31,7 @@ from twitter_cli.parser import (
     _normalize_article_entity_map,
     _parse_article,
     _parse_int,
+    _render_article_text_block,
     parse_tweet_result,
     parse_user_result,
 )
@@ -475,6 +476,31 @@ class TestExtractAtomicMarkdown:
         assert _extract_atomic_markdown(block, entity_map) == []
 
 
+class TestRenderArticleTextBlock:
+    def test_renders_inline_link_entities_as_markdown(self):
+        block = {
+            "text": "Read the docs and the course.",
+            "entityRanges": [
+                {"key": 0, "offset": 9, "length": 4},
+                {"key": 1, "offset": 22, "length": 6},
+            ],
+        }
+        entity_map = {
+            "0": {"type": "LINK", "data": {"url": "https://docs.example.com"}},
+            "1": {"type": "LINK", "data": {"url": "https://course.example.com"}},
+        }
+
+        assert _render_article_text_block(block, entity_map) == (
+            "Read the [docs](https://docs.example.com) and the [course](https://course.example.com)."
+        )
+
+    def test_ignores_non_link_entities(self):
+        block = {"text": "Intro", "entityRanges": [{"key": 4, "offset": 0, "length": 5}]}
+        entity_map = {"4": {"type": "MARKDOWN", "data": {"markdown": "```md\nIntro\n```"}}}
+
+        assert _render_article_text_block(block, entity_map) == "Intro"
+
+
 class TestParseArticle:
     def test_preserves_atomic_markdown_between_text_blocks(self):
         result = {
@@ -624,6 +650,55 @@ class TestParseArticle:
             ),
         }
 
+    def test_renders_inline_hyperlinks_from_article_entity_ranges(self):
+        result = {
+            "article": {
+                "article_results": {
+                    "result": {
+                        "title": "Linked article",
+                        "content_state": {
+                            "blocks": [
+                                {
+                                    "key": "a",
+                                    "type": "unstyled",
+                                    "text": "Read the docs and the course.",
+                                    "entityRanges": [
+                                        {"key": 0, "offset": 9, "length": 4},
+                                        {"key": 1, "offset": 22, "length": 6},
+                                    ],
+                                }
+                            ],
+                            "entityMap": [
+                                {
+                                    "key": "0",
+                                    "value": {
+                                        "type": "LINK",
+                                        "data": {"url": "https://docs.example.com"},
+                                    },
+                                },
+                                {
+                                    "key": "1",
+                                    "value": {
+                                        "type": "LINK",
+                                        "data": {"url": "https://course.example.com"},
+                                    },
+                                },
+                            ],
+                        },
+                    }
+                }
+            }
+        }
+
+        parsed = _parse_article(result)
+
+        assert parsed == {
+            "article_title": "Linked article",
+            "article_text": (
+                "Read the [docs](https://docs.example.com) and the [course](https://course.example.com)."
+            ),
+        }
+
 
 # ── TwitterClient._parse_tweet_result ─────────────────────────────────────
 
diff --git a/twitter_cli/parser.py b/twitter_cli/parser.py
index f0f62c8..d56627a 100644
--- a/twitter_cli/parser.py
+++ b/twitter_cli/parser.py
@@ -215,6 +215,63 @@ def _extract_atomic_markdown(block, entity_map):
     return parts
 
 
+def _render_article_text_block(block, entity_map):
+    # type: (Dict[str, Any], Dict[str, Any]) -> str
+    """Render a Draft.js text block, converting inline hyperlinks to Markdown.
+
+    Each entry of ``block["entityRanges"]`` whose entity resolves (through
+    ``entity_map``, keyed by ``str(key)``) to a ``LINK`` with a non-blank
+    ``data.url`` has its span rewritten as ``[label](url)``.  Ranges are
+    validated against the original text, and a range that overlaps an
+    already accepted link is skipped, so malformed input never splices
+    Markdown into previously inserted link syntax.
+
+    Returns ``""`` when the block has no usable text.
+    """
+    text = block.get("text", "")
+    if not isinstance(text, str) or not text:
+        return ""
+
+    # NOTE(review): offsets are applied as Python code-point indexes; Draft.js
+    # presumably reports UTF-16 code units, so astral characters (e.g. emoji)
+    # before a link could shift the span -- confirm against real payloads.
+    links = []
+    for entity_range in block.get("entityRanges", []) or []:
+        if not isinstance(entity_range, dict):
+            continue
+        entity_key = entity_range.get("key")
+        entity = entity_map.get(str(entity_key)) if entity_key is not None else None
+        if not isinstance(entity, dict):
+            continue
+        if str(entity.get("type") or "").upper() != "LINK":
+            continue
+        offset = entity_range.get("offset")
+        length = entity_range.get("length")
+        if not isinstance(offset, int) or not isinstance(length, int) or length <= 0:
+            continue
+        # Bounds are checked against the untouched source text, not a
+        # partially rewritten copy whose length has already grown.
+        if offset < 0 or offset + length > len(text):
+            continue
+        url = _deep_get(entity, "data", "url")
+        if not isinstance(url, str) or not url.strip():
+            continue
+        links.append((offset, offset + length, url.strip()))
+
+    # Single left-to-right pass over the original text.
+    pieces = []
+    cursor = 0
+    for start, end, url in sorted(links):
+        if start < cursor:
+            # Overlaps (or duplicates) the previous link; keep the first one.
+            continue
+        pieces.append(text[cursor:start])
+        pieces.append("[%s](%s)" % (text[start:end], url))
+        cursor = end
+    pieces.append(text[cursor:])
+    return "".join(pieces)
+
+
 def _find_article_caption(value):
     # type: (Any) -> Optional[str]
     """Best-effort extraction of image caption/alt text from article entity data."""
@@ -289,7 +346,7 @@ def _parse_article(tweet_data):
             parts.extend(_extract_article_images(block, entity_map, media_url_map))
             ordered_counter = 0
             continue
-        text = block.get("text", "")  # type: str
+        text = _render_article_text_block(block, entity_map)
         if not text:
             continue
         if block_type != "ordered-list-item":