Initial commit: latest3 YouTube extractor
This commit is contained in:
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
__pycache__/
|
||||
*.pyc
|
||||
32
README.md
Normal file
32
README.md
Normal file
@@ -0,0 +1,32 @@
|
||||
# youtube-latest3
|
||||
|
||||
Fetch the latest uploads from a YouTube channel URL (including dynamic `@handle` URLs like `https://www.youtube.com/@ludwig/videos`).
|
||||
|
||||
## Why this approach
|
||||
|
||||
This tool resolves the channel ID from the exact channel URL, then reads YouTube's official uploads feed:
|
||||
|
||||
- `https://www.youtube.com/feeds/videos.xml?channel_id=<CHANNEL_ID>`
|
||||
|
||||
That feed is ordered by newest uploads, so fetching the first 3 entries gives the latest 3 videos.
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
python3 latest3.py "https://www.youtube.com/@ludwig/videos"
|
||||
python3 latest3.py "@ludwig" --json
|
||||
python3 latest3.py "https://www.youtube.com/@ludwig/videos" --limit 3 --json
|
||||
```
|
||||
|
||||
## Output fields
|
||||
|
||||
- `id` - video ID
|
||||
- `title` - video title
|
||||
- `url` - watch URL
|
||||
- `published` - ISO-8601 timestamp from the feed
|
||||
|
||||
## Notes
|
||||
|
||||
- Works with `@handle` URLs and plain `@handle` input.
|
||||
- No API key required.
|
||||
- If YouTube changes page markup for channel ID extraction, update `extract_channel_id()`.
|
||||
134
latest3.py
Executable file
134
latest3.py
Executable file
@@ -0,0 +1,134 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
# Desktop-browser User-Agent sent with every request.
# NOTE(review): presumably needed so YouTube serves the full desktop markup
# that contains the channel-ID markers extract_channel_id() looks for —
# confirm against a response fetched without this header.
UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
|
||||
|
||||
|
||||
def http_get(url: str, timeout: int = 20) -> str:
    """Fetch *url* over HTTP(S) and return the body decoded as UTF-8.

    Sends the module-level browser User-Agent header. Bytes that do not
    decode as UTF-8 are dropped ("ignore") rather than raising.
    """
    request = urllib.request.Request(url, headers={"User-Agent": UA})
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.read().decode("utf-8", "ignore")
|
||||
|
||||
|
||||
def normalize_channel_url(raw: str) -> str:
    """Turn a channel URL or bare ``@handle`` into a canonical channel URL.

    Bare handles become ``https://www.youtube.com/@handle/videos``; URLs
    are re-rooted at ``https://www.youtube.com`` and ``@handle`` paths get
    ``/videos`` appended when missing.

    Raises:
        ValueError: on empty input or a non-YouTube host.
    """
    text = raw.strip()
    if not text:
        raise ValueError("Empty URL/handle")

    # Bare "@handle" input: build the canonical uploads-tab URL directly.
    if text.startswith("@"):
        return f"https://www.youtube.com/{text}/videos"

    if not text.startswith(("http://", "https://")):
        text = "https://" + text

    parsed = urllib.parse.urlparse(text)
    host = parsed.netloc
    if "youtube.com" not in host and "youtu.be" not in host:
        raise ValueError("Not a YouTube URL/handle")

    path = parsed.path or "/"
    # Handle-style paths should point at the uploads tab.
    if path.startswith("/@"):
        trimmed = path.rstrip("/")
        if not trimmed.endswith("/videos"):
            path = trimmed + "/videos"

    # Canonicalize scheme/host regardless of what the caller supplied.
    return urllib.parse.urlunparse(("https", "www.youtube.com", path, "", "", ""))
|
||||
|
||||
|
||||
def extract_channel_id(html: str) -> str:
    """Pull the canonical ``UC...`` channel ID out of channel-page HTML.

    Tries several known page markers in order of reliability.

    Raises:
        RuntimeError: when no marker matches (e.g. YouTube changed markup).
    """
    # All markers share the same 24-char channel-ID capture group.
    uc_id = r"(UC[0-9A-Za-z_-]{22})"
    for marker in (
        r'"channelId":"%s"' % uc_id,
        r'"externalId":"%s"' % uc_id,
        r'<meta itemprop="channelId" content="%s">' % uc_id,
    ):
        if (found := re.search(marker, html)) is not None:
            return found.group(1)
    raise RuntimeError("Could not resolve channel ID from URL")
|
||||
|
||||
|
||||
def fetch_latest_from_feed(channel_id: str, limit: int = 3):
    """Read the channel's public Atom uploads feed and return the newest videos.

    Returns up to *limit* dicts with ``id``/``title``/``url``/``published``
    keys, in feed order (newest first).

    Raises:
        RuntimeError: when the feed contains no entries.
    """
    feed_url = f"https://www.youtube.com/feeds/videos.xml?channel_id={channel_id}"
    document = ET.fromstring(http_get(feed_url))

    # Namespaces used by YouTube's uploads feed.
    namespaces = {
        "atom": "http://www.w3.org/2005/Atom",
        "yt": "http://www.youtube.com/xml/schemas/2015",
        "media": "http://search.yahoo.com/mrss/",
    }

    videos = []
    for entry in document.findall("atom:entry", namespaces)[:limit]:
        video_id = entry.findtext("yt:videoId", default="", namespaces=namespaces)

        # Prefer the entry's own link; fall back to a constructed watch URL.
        link = entry.find("atom:link", namespaces)
        if link is not None:
            watch_url = link.attrib.get("href")
        elif video_id:
            watch_url = f"https://www.youtube.com/watch?v={video_id}"
        else:
            watch_url = ""

        videos.append(
            {
                "id": video_id,
                "title": entry.findtext("atom:title", default="", namespaces=namespaces),
                "url": watch_url,
                "published": entry.findtext("atom:published", default="", namespaces=namespaces),
            }
        )

    if not videos:
        raise RuntimeError("No videos found in channel feed")
    return videos
|
||||
|
||||
|
||||
def get_latest_videos(channel_url_or_handle: str, limit: int = 3):
    """Resolve a channel URL/handle and fetch its newest uploads.

    Normalizes the input, scrapes the channel page for its ``UC...`` ID,
    then reads the official uploads feed. Returns a dict echoing the
    input plus ``resolved_url``, ``channel_id``, and the ``videos`` list.
    """
    resolved = normalize_channel_url(channel_url_or_handle)
    page_html = http_get(resolved)
    cid = extract_channel_id(page_html)
    return {
        "input": channel_url_or_handle,
        "resolved_url": resolved,
        "channel_id": cid,
        "videos": fetch_latest_from_feed(cid, limit=limit),
    }
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments, fetch, and print the latest uploads."""
    parser = argparse.ArgumentParser(description="Get latest YouTube uploads from a channel URL/handle")
    parser.add_argument("channel", help="YouTube channel URL (including @handle) or @handle")
    parser.add_argument("--limit", type=int, default=3, help="How many latest videos to return (default: 3)")
    parser.add_argument("--json", action="store_true", help="Print full JSON output")
    args = parser.parse_args()

    # Keep the request bounded; exit code 2 signals a usage error.
    if not 1 <= args.limit <= 20:
        print("--limit must be between 1 and 20", file=sys.stderr)
        sys.exit(2)

    try:
        result = get_latest_videos(args.channel, limit=args.limit)
    except (urllib.error.URLError, urllib.error.HTTPError) as exc:
        print(f"Network error: {exc}", file=sys.stderr)
        sys.exit(1)
    except Exception as exc:
        # Top-level CLI boundary: report anything else and exit non-zero.
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(1)

    if args.json:
        print(json.dumps(result, indent=2, ensure_ascii=False))
        return

    # Human-readable listing, one numbered video per group of lines.
    print(f"Channel ID: {result['channel_id']}")
    for index, video in enumerate(result["videos"], start=1):
        print(f"{index}. {video['title']}")
        print(f" {video['url']}")
        print(f" published: {video['published']}")
|
||||
|
||||
|
||||
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user