arXiv URL to Obsidian Markdown Note in Python

```python from __future__ import annotations import argparse import datetime as dt import os import re import sys import textwrap import urllib.request import xml.etree.ElementTree as ET from titlecase import titlecase ARXIV_API = "https://export.arxiv.org/api/query?id_list={id}" NS = { "atom": "http://www.w3.org/2005/Atom", "arxiv": "http://arxiv.org/schemas/atom", } def parse_args(): p = argparse.ArgumentParser(description="Make an Obsidian Markdown note from an arXiv URL/ID.") p.add_argument("url_or_id", help="arXiv URL (abs/pdf) or bare ID (e.g., 2407.12345)") p.add_argument("-o", "--outdir", default=".", help="Directory to write the .md file (default: current dir)") return p.parse_args() def extract_arxiv_id(s: str) -> str: """ Accepts: - bare IDs: 2407.12345 or 2407.12345v2 or cs/0601001 - URLs: https://arxiv.org/abs/2407.12345v2, /pdf/..., /format/... Returns canonical id with version if present. """ s = s.strip() # If it's already a plausible ID, return id_pat = r"(?:(?:\d{4}\.\d{4,5})(?:v\d+)?)|(?:[a-z\-]+(?:\.[A-Z]{2})?/\d{7}(?:v\d+)?)" if re.fullmatch(id_pat, s, re.IGNORECASE): return s # Try to extract from known URL patterns m = re.search(r"/abs/([^/?#]+)", s) if not m: m = re.search(r"/pdf/([^/?#]+)(?:\.pdf)?", s) if not m: m = re.search(r"/format/([^/?#]+)", s) if m: cand = m.group(1) if re.fullmatch(id_pat, cand, re.IGNORECASE): return cand raise ValueError(f"Could not parse an arXiv ID from: {s}") def fetch_atom_xml(arxiv_id: str) -> ET.Element: url = ARXIV_API.format(id=arxiv_id) with urllib.request.urlopen(url) as resp: data = resp.read() try: root = ET.fromstring(data) return root except ET.ParseError as e: raise RuntimeError(f"Failed to parse arXiv API XML for {arxiv_id}: {e}") def get_text(elem: ET.Element | None) -> str: return (elem.text or "").strip() if elem is not None else "" def first(elem: ET.Element, path: str) -> ET.Element | None: return elem.find(path, NS) def all_of(elem: ET.Element, path: str) -> list[ET.Element]: return 
elem.findall(path, NS) def parse_entry(root: ET.Element) -> dict: entry = first(root, "atom:entry") if entry is None: raise RuntimeError("No entry found in arXiv API response (bad ID?)") title = get_text(first(entry, "atom:title")) summary = get_text(first(entry, "atom:summary")) published = get_text(first(entry, "atom:published")) # e.g. 2024-07-14T17:38:55Z updated = get_text(first(entry, "atom:updated")) year = published[:4] if published else "" # authors authors = [get_text(first(a, "atom:name")) for a in all_of(entry, "atom:author")] # links (pdf + abs) pdf_url = "" abs_url = "" for link in all_of(entry, "atom:link"): href = link.get("href", "") rel = link.get("rel", "") typ = link.get("type", "") if typ == "application/pdf" or link.get("title", "") == "pdf": pdf_url = href if rel == "alternate" and "arxiv.org/abs" in href: abs_url = href # doi (optional) doi = get_text(first(entry, "arxiv:doi")) # journal ref (optional) journal_ref = get_text(first(entry, "arxiv:journal_ref")) # categories primary_cat = "" pc = first(entry, "arxiv:primary_category") if pc is not None: primary_cat = pc.get("term", "") # e.g. 
cs.CV categories = [c.get("term", "") for c in all_of(entry, "atom:category") if c.get("term")] return { "title": title, "summary": summary, "published": published, "updated": updated, "year": year, "authors": authors, "pdf_url": pdf_url, "abs_url": abs_url, "doi": doi, "journal_ref": journal_ref, "primary_category": primary_cat, "categories": categories, } def slugify(s: str, maxlen: int = 120) -> str: s = re.sub(r"[^\w\s\-]+", "", s, flags=re.UNICODE) s = re.sub(r"\s+", " ", s).strip() #s = s.replace(" ", "-") return s[:maxlen].strip() def render_markdown(arxiv_id: str, meta: dict) -> str: today = dt.date.today().strftime("%Y-%m-%d") frontmatter = "\n".join([ "---", f'title: "{meta["title"]}"', f"authors: {', '.join(meta['authors']) if meta['authors'] else '- '}", f"year: {meta['year'] or ''}", f"arxiv_id: {arxiv_id}", f"doi: {meta['doi'] or ''}", f"abs_url: {meta['abs_url'] or ('https://arxiv.org/abs/' + arxiv_id)}", f"pdf_url: {meta['pdf_url'] or ('https://arxiv.org/pdf/' + arxiv_id + '.pdf')}", f"primary_category: {meta['primary_category']}", f"categories: [{', '.join(meta['categories'])}]", "publish: true", "---", "", # blank line after frontmatter ]) abstract = ' '.join(meta["summary"].splitlines()) #abstract = meta["summary"] body = "\n".join([ "## 📚 Reference Info", f"| | |", f"|---|---|", f"| Authors | {', '.join(meta['authors']) if meta['authors'] else ''}", f"| Year | {meta['year'] or ''}", f"| Journal/Conference | {meta['journal_ref'] or ''}", f"| DOI | {meta['doi'] or ''}", f"| ABS | {meta['abs_url'] or ('https://arxiv.org/abs/' + arxiv_id)}", f"| PDF | {meta['pdf_url'] or ('https://arxiv.org/pdf/' + arxiv_id + '.pdf')}", "", "## 📝 Abstract (from arXiv)", abstract, "", f'<iframe src="{("https://arxiv.org/pdf/" + arxiv_id)}" width="100%" style="height: 80vh;"></iframe>', "", ]) return frontmatter + body ``` ```python # Example usage: URL="https://arxiv.org/pdf/2305.15253" OUT_DIR="C:/Users/lukeb/Documents/Obsidian/Vault/Great Papers by Other People" 
arxiv_id = extract_arxiv_id(URL) root = fetch_atom_xml(arxiv_id) meta = parse_entry(root) base = f"{titlecase(slugify(meta['title']))}.md" outdir = os.path.abspath(OUT_DIR) os.makedirs(outdir, exist_ok=True) path = os.path.join(outdir, base) md = render_markdown(arxiv_id, meta) with open(path, "w", encoding="utf-8") as f: f.write(md) print(f"Wrote: {path}") ```