```python
from __future__ import annotations
import argparse
import datetime as dt
import os
import re
import sys
import textwrap
import urllib.request
import xml.etree.ElementTree as ET
from titlecase import titlecase
# URL template for the arXiv export API; `{id}` is filled in with the paper's
# arXiv identifier via str.format() in fetch_atom_xml().
ARXIV_API = "https://export.arxiv.org/api/query?id_list={id}"
# Prefix -> namespace URI map handed to ElementTree find()/findall() so paths
# can be written as "atom:entry", "arxiv:doi", etc.
NS = {
    "atom": "http://www.w3.org/2005/Atom",
    "arxiv": "http://arxiv.org/schemas/atom",
}
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line arguments of the note generator.

    Args:
        argv: Argument strings to parse, without the program name. When
            ``None`` (the default) argparse reads ``sys.argv[1:]``, which is
            exactly what a zero-argument call has always done.

    Returns:
        A namespace with two attributes: ``url_or_id`` (the positional arXiv
        URL or bare ID) and ``outdir`` (target directory, ``"."`` by default).
    """
    parser = argparse.ArgumentParser(
        description="Make an Obsidian Markdown note from an arXiv URL/ID."
    )
    parser.add_argument(
        "url_or_id",
        help="arXiv URL (abs/pdf) or bare ID (e.g., 2407.12345)",
    )
    parser.add_argument(
        "-o",
        "--outdir",
        default=".",
        help="Directory to write the .md file (default: current dir)",
    )
    # Passing argv through lets callers drive the parser programmatically
    # instead of being tied to the process's real command line.
    return parser.parse_args(argv)
def extract_arxiv_id(s: str) -> str:
    """Return the arXiv identifier contained in a bare ID or an arXiv URL.

    Accepts:
    - bare IDs: 2407.12345 or 2407.12345v2 or cs/0601001
    - URLs: https://arxiv.org/abs/2407.12345v2, /pdf/..., /format/...
      (a trailing ``.pdf``, query string or fragment is ignored)

    Args:
        s: The user-supplied URL or identifier; surrounding whitespace is
            ignored.

    Returns:
        The identifier, including its ``vN`` version suffix when present.

    Raises:
        ValueError: If no arXiv identifier can be recognised in ``s``.
    """
    s = s.strip()
    # New-style (YYMM.NNNNN) or old-style (archive[.XX]/NNNNNNN), optional version.
    id_pat = r"(?:\d{4}\.\d{4,5}(?:v\d+)?)|(?:[a-z\-]+(?:\.[A-Z]{2})?/\d{7}(?:v\d+)?)"
    if re.fullmatch(id_pat, s, re.IGNORECASE):
        return s

    # Match the identifier itself right after the known path segment rather
    # than grabbing "everything up to the next slash": old-style IDs contain
    # a slash, and /pdf/ links may carry a ".pdf" suffix that is not part of
    # the ID. The lookahead makes sure the ID is not a prefix of a longer token.
    url_pat = rf"/(?:abs|pdf|format)/({id_pat})(?:\.pdf)?(?=[/?#]|$)"
    m = re.search(url_pat, s, re.IGNORECASE)
    if m:
        return m.group(1)
    raise ValueError(f"Could not parse an arXiv ID from: {s}")
def fetch_atom_xml(arxiv_id: str, timeout: float = 30.0) -> ET.Element:
    """Download and parse the arXiv API response for one identifier.

    Args:
        arxiv_id: Identifier substituted into the ``ARXIV_API`` URL template.
        timeout: Seconds to wait on the HTTP request before giving up, so an
            unresponsive server cannot block the caller forever.

    Returns:
        The root element of the parsed XML document.

    Raises:
        RuntimeError: If the response body is not well-formed XML. The
            underlying ``ParseError`` is attached as ``__cause__``.
        urllib.error.URLError: Propagated unchanged on network/HTTP failure.
    """
    url = ARXIV_API.format(id=arxiv_id)
    with urllib.request.urlopen(url, timeout=timeout) as resp:
        data = resp.read()
    # Parse after the connection has been released; only parsing is guarded.
    try:
        return ET.fromstring(data)
    except ET.ParseError as e:
        raise RuntimeError(
            f"Failed to parse arXiv API XML for {arxiv_id}: {e}"
        ) from e
def get_text(elem: ET.Element | None) -> str:
    """Return the element's text with surrounding whitespace removed.

    A missing element (``None``) or an element without text yields ``""``.
    """
    if elem is None:
        return ""
    raw = elem.text
    if not raw:
        return ""
    return raw.strip()
def first(elem: ET.Element, path: str) -> ET.Element | None:
    """Return the first element under ``elem`` matching ``path``, else ``None``.

    ``path`` may use the namespace prefixes declared in the module-level ``NS``
    mapping (for example ``"atom:title"``).
    """
    match = elem.find(path, namespaces=NS)
    return match
def all_of(elem: ET.Element, path: str) -> list[ET.Element]:
    """Return every element under ``elem`` matching ``path`` (possibly empty).

    ``path`` may use the namespace prefixes declared in the module-level ``NS``
    mapping (for example ``"atom:author"``).
    """
    matches = elem.findall(path, namespaces=NS)
    return matches
def parse_entry(root: ET.Element) -> dict:
    """Collect the metadata of the first ``atom:entry`` found under ``root``.

    Returns:
        A dict with the keys ``title``, ``summary``, ``published``,
        ``updated``, ``year``, ``authors``, ``pdf_url``, ``abs_url``, ``doi``,
        ``journal_ref``, ``primary_category`` and ``categories``. Fields that
        are absent from the entry come back as ``""`` (or ``[]`` for lists).

    Raises:
        RuntimeError: If ``root`` contains no ``atom:entry`` element.
    """
    entry = first(root, "atom:entry")
    if entry is None:
        raise RuntimeError("No entry found in arXiv API response (bad ID?)")

    def text_of(path: str) -> str:
        # Stripped text of the first matching child, "" when it is missing.
        return get_text(first(entry, path))

    published = text_of("atom:published")  # e.g. 2024-07-14T17:38:55Z

    author_names = []
    for author in all_of(entry, "atom:author"):
        author_names.append(get_text(first(author, "atom:name")))

    # Scan every link; when several qualify, the last one seen is kept.
    pdf_link = ""
    abs_link = ""
    for link in all_of(entry, "atom:link"):
        href = link.get("href", "")
        is_pdf = (
            link.get("type", "") == "application/pdf"
            or link.get("title", "") == "pdf"
        )
        if is_pdf:
            pdf_link = href
        if link.get("rel", "") == "alternate" and "arxiv.org/abs" in href:
            abs_link = href

    primary = first(entry, "arxiv:primary_category")
    primary_term = "" if primary is None else primary.get("term", "")  # e.g. cs.CV

    # Keep only categories that carry a non-empty "term" attribute.
    category_terms = []
    for category in all_of(entry, "atom:category"):
        term = category.get("term")
        if term:
            category_terms.append(term)

    return {
        "title": text_of("atom:title"),
        "summary": text_of("atom:summary"),
        "published": published,
        "updated": text_of("atom:updated"),
        # The timestamp starts with the four-digit year; "" stays "".
        "year": published[:4],
        "authors": author_names,
        "pdf_url": pdf_link,
        "abs_url": abs_link,
        "doi": text_of("arxiv:doi"),  # optional
        "journal_ref": text_of("arxiv:journal_ref"),  # optional
        "primary_category": primary_term,
        "categories": category_terms,
    }
def slugify(s: str, maxlen: int = 120) -> str:
    """Turn ``s`` into a filename-friendly string of at most ``maxlen`` chars.

    Everything except word characters, whitespace and hyphens is dropped,
    whitespace runs become a single space, and the result is trimmed both
    before and after truncation so it never starts or ends with a space.
    """
    kept = re.sub(r"[^\w\s\-]+", "", s, flags=re.UNICODE)
    single_spaced = re.sub(r"\s+", " ", kept)
    trimmed = single_spaced.strip()
    truncated = trimmed[:maxlen]
    return truncated.strip()
def render_markdown(arxiv_id: str, meta: dict) -> str:
    """Render the Obsidian note (YAML frontmatter + Markdown body) for a paper.

    Args:
        arxiv_id: Identifier used for the fallback abs/pdf URLs and for the
            embedded PDF iframe.
        meta: Metadata dict providing the keys ``title``, ``summary``,
            ``authors``, ``year``, ``doi``, ``journal_ref``, ``abs_url``,
            ``pdf_url``, ``primary_category`` and ``categories``.

    Returns:
        The complete note text, ending with a newline.
    """
    # The title is emitted as a YAML double-quoted scalar. Collapse any line
    # breaks/whitespace runs so it stays on one line, then escape backslashes
    # (e.g. from LaTeX) and double quotes, which would otherwise be read as
    # YAML escape sequences or terminate the scalar early.
    title = " ".join(meta["title"].split())
    yaml_title = title.replace("\\", "\\\\").replace('"', '\\"')

    authors = ", ".join(meta["authors"])
    # Fall back to URLs derived from the ID when the metadata carries none.
    abs_url = meta["abs_url"] or ("https://arxiv.org/abs/" + arxiv_id)
    pdf_url = meta["pdf_url"] or ("https://arxiv.org/pdf/" + arxiv_id + ".pdf")

    frontmatter = "\n".join([
        "---",
        f'title: "{yaml_title}"',
        f"authors: {authors if meta['authors'] else '- '}",
        f"year: {meta['year'] or ''}",
        f"arxiv_id: {arxiv_id}",
        f"doi: {meta['doi'] or ''}",
        f"abs_url: {abs_url}",
        f"pdf_url: {pdf_url}",
        f"primary_category: {meta['primary_category']}",
        f"categories: [{', '.join(meta['categories'])}]",
        "publish: true",
        "---",
        "",  # terminates the closing "---" line with a newline
    ])

    # Join the abstract's lines into a single paragraph.
    abstract = " ".join(meta["summary"].splitlines())

    body = "\n".join([
        "## 📚 Reference Info",
        "| | |",
        "|---|---|",
        f"| Authors | {authors}",
        f"| Year | {meta['year'] or ''}",
        f"| Journal/Conference | {meta['journal_ref'] or ''}",
        f"| DOI | {meta['doi'] or ''}",
        f"| ABS | {abs_url}",
        f"| PDF | {pdf_url}",
        "",
        "## 📝 Abstract (from arXiv)",
        abstract,
        "",
        f'<iframe src="https://arxiv.org/pdf/{arxiv_id}" width="100%" style="height: 80vh;"></iframe>',
        "",
    ])
    return frontmatter + body
```
```python
# Example usage: build the note for one hard-coded paper and save it.
# NOTE: these statements run as soon as this code is executed; they perform
# a network request (fetch_atom_xml) and write a file to disk.
URL="https://arxiv.org/pdf/2305.15253"
# Destination folder for the generated note (machine-specific absolute path).
OUT_DIR="C:/Users/lukeb/Documents/Obsidian/Vault/Great Papers by Other People"
# URL -> arXiv ID -> API response -> metadata dict.
arxiv_id = extract_arxiv_id(URL)
root = fetch_atom_xml(arxiv_id)
meta = parse_entry(root)
# File name: the slugified title run through titlecase(), plus ".md".
base = f"{titlecase(slugify(meta['title']))}.md"
outdir = os.path.abspath(OUT_DIR)
# Create the destination folder if it does not exist yet.
os.makedirs(outdir, exist_ok=True)
path = os.path.join(outdir, base)
md = render_markdown(arxiv_id, meta)
# Mode "w" replaces any existing note that has the same file name.
with open(path, "w", encoding="utf-8") as f:
    f.write(md)
print(f"Wrote: {path}")
```