fix(article): preserve inline hyperlinks in markdown export

2026-03-21 13:34:35 +08:00
parent 199a1490f9
commit c63b5a2ede
2 changed files with 118 additions and 1 deletion
--- a/tests/test_client.py
+++ b/tests/test_client.py
@@ -31,6 +31,7 @@ from twitter_cli.parser import (
    _normalize_article_entity_map,
    _parse_article,
    _parse_int,
    _render_article_text_block,
    parse_tweet_result,
    parse_user_result,
 )
@@ -475,6 +476,31 @@ class TestExtractAtomicMarkdown:
        assert _extract_atomic_markdown(block, entity_map) == []
 class TestRenderArticleTextBlock:
    """Unit tests for ``_render_article_text_block``."""

    def test_renders_inline_link_entities_as_markdown(self):
        """Two LINK entity ranges in one block both become ``[label](url)``."""
        link_entities = {
            "0": {"type": "LINK", "data": {"url": "https://docs.example.com"}},
            "1": {"type": "LINK", "data": {"url": "https://course.example.com"}},
        }
        text_block = {
            "text": "Read the docs and the course.",
            "entityRanges": [
                {"key": 0, "offset": 9, "length": 4},
                {"key": 1, "offset": 22, "length": 6},
            ],
        }
        expected = (
            "Read the [docs](https://docs.example.com) and the [course](https://course.example.com)."
        )

        rendered = _render_article_text_block(text_block, link_entities)

        assert rendered == expected

    def test_ignores_non_link_entities(self):
        """A range pointing at a non-LINK entity leaves the text untouched."""
        markdown_entity = {"type": "MARKDOWN", "data": {"markdown": "```md\nIntro\n```"}}
        text_block = {
            "text": "Intro",
            "entityRanges": [{"key": 4, "offset": 0, "length": 5}],
        }

        rendered = _render_article_text_block(text_block, {"4": markdown_entity})

        assert rendered == "Intro"
 class TestParseArticle:
    def test_preserves_atomic_markdown_between_text_blocks(self):
        result = {
@@ -624,6 +650,55 @@ class TestParseArticle:
            ),
        }
    def test_renders_inline_hyperlinks_from_article_entity_ranges(self):
        """``_parse_article`` renders LINK entity ranges as Markdown links.

        The payload supplies ``entityMap`` as a list of ``{"key", "value"}``
        entries rather than as a plain mapping.
        """

        def link_entry(key, url):
            # One ``entityMap`` list entry describing a LINK entity.
            return {"key": key, "value": {"type": "LINK", "data": {"url": url}}}

        paragraph = {
            "key": "a",
            "type": "unstyled",
            "text": "Read the docs and the course.",
            "entityRanges": [
                {"key": 0, "offset": 9, "length": 4},
                {"key": 1, "offset": 22, "length": 6},
            ],
        }
        content_state = {
            "blocks": [paragraph],
            "entityMap": [
                link_entry("0", "https://docs.example.com"),
                link_entry("1", "https://course.example.com"),
            ],
        }
        article = {"title": "Linked article", "content_state": content_state}
        result = {"article": {"article_results": {"result": article}}}

        expected = {
            "article_title": "Linked article",
            "article_text": (
                "Read the [docs](https://docs.example.com) and the [course](https://course.example.com)."
            ),
        }

        assert _parse_article(result) == expected
 # ── TwitterClient._parse_tweet_result ─────────────────────────────────────
--- a/twitter_cli/parser.py
+++ b/twitter_cli/parser.py
@@ -215,6 +215,48 @@ def _extract_atomic_markdown(block, entity_map):
    return parts
 def _render_article_text_block(block, entity_map):
    # type: (Dict[str, Any], Dict[str, Any]) -> str
    """Render a Draft.js text block, converting inline hyperlinks to Markdown.

    Args:
        block: Block dict. ``text`` holds the raw string and ``entityRanges``
            an optional list of ``{"key", "offset", "length"}`` dicts.
        entity_map: Entities keyed by the string form of each range's ``key``.
            Only entities whose ``type`` is ``LINK`` (case-insensitive) and
            whose ``data.url`` is a non-blank string are rendered.

    Returns:
        The block text with every usable link range rewritten as
        ``[label](url)``, or ``""`` when ``text`` is missing, empty, or not a
        string. Malformed ranges, ranges that fall outside the original text,
        and ranges that overlap an earlier accepted range are ignored, so the
        remaining text is always passed through unchanged.

    Note:
        Labels and URLs are inserted verbatim; no Markdown escaping is applied.
    """
    text = block.get("text", "")
    if not isinstance(text, str) or not text:
        return ""

    # NOTE(review): offsets are applied as Python ``str`` indices (code
    # points). Draft.js runs in JavaScript, where string indices are UTF-16
    # code units, so text with astral characters (e.g. emoji) ahead of a link
    # may need an offset translation -- confirm against real payloads.
    text_length = len(text)
    ranges = []
    for entity_range in block.get("entityRanges", []) or []:
        if not isinstance(entity_range, dict):
            continue
        entity_key = entity_range.get("key")
        entity = entity_map.get(str(entity_key)) if entity_key is not None else None
        if not isinstance(entity, dict):
            continue
        if str(entity.get("type") or "").upper() != "LINK":
            continue
        offset = entity_range.get("offset")
        length = entity_range.get("length")
        if not isinstance(offset, int) or not isinstance(length, int) or length <= 0:
            continue
        # Validate against the ORIGINAL text: checking against a string that
        # already contains inserted Markdown would let an overlong range pass.
        if offset < 0 or offset + length > text_length:
            continue
        url = _deep_get(entity, "data", "url")
        if not isinstance(url, str) or not url.strip():
            continue
        ranges.append((offset, length, url.strip()))

    # Single left-to-right pass over the original text. ``cursor`` marks the
    # end of the last emitted span, so a duplicate or overlapping range (one
    # that starts before ``cursor``) is dropped instead of being applied on
    # top of Markdown that was already generated.
    parts = []
    cursor = 0
    for offset, length, url in sorted(ranges):
        if offset < cursor:
            continue
        end = offset + length
        parts.append(text[cursor:offset])
        parts.append("[%s](%s)" % (text[offset:end], url))
        cursor = end
    parts.append(text[cursor:])
    return "".join(parts)
 def _find_article_caption(value):
    # type: (Any) -> Optional[str]
    """Best-effort extraction of image caption/alt text from article entity data."""
@@ -289,7 +331,7 @@ def _parse_article(tweet_data):
            parts.extend(_extract_article_images(block, entity_map, media_url_map))
            ordered_counter = 0
            continue
-        text = block.get("text", "")  # type: str
+        text = _render_article_text_block(block, entity_map)
        if not text:
            continue
        if block_type != "ordered-list-item":