levineuwirth.org/tools/import-poetry.py

402 lines
13 KiB
Python

#!/usr/bin/env python3
"""
import-poetry.py — Import a poetry collection from a Project Gutenberg plain-text file.
Produces:
content/poetry/{collection-slug}/index.md Collection index page
content/poetry/{collection-slug}/{poem-slug}.md One file per poem
Usage:
python tools/import-poetry.py gutenberg.txt \\
--poet "William Shakespeare" \\
--collection "Sonnets" \\
--date 1609 \\
--title-prefix "Sonnet" \\
--tags poetry,english \\
[--slug shakespeare-sonnets] \\
[--interactive] \\
[--dry-run] \\
[--overwrite]
The --title-prefix controls per-poem title generation:
"Sonnet""Sonnet 1", "Sonnet 2", ..., slug "sonnet-1", "sonnet-2"
If omitted, defaults to the singular of --collection (strips trailing 's').
"""
import argparse
import re
import sys
from pathlib import Path
from typing import Optional
# Repo root is resolved from this script's own location (assumed to live one
# directory below the root, e.g. tools/ — see the usage line in the module
# docstring), so the importer works from any current working directory.
REPO_ROOT = Path(__file__).parent.parent
# All generated collections are written under content/poetry/.
POETRY_DIR = REPO_ROOT / "content" / "poetry"
# ---------------------------------------------------------------------------
# Roman numeral conversion
# ---------------------------------------------------------------------------
# (numeral, value) pairs in descending value order, with the subtractive
# forms (CM, CD, XC, XL, IX, IV) listed before the single letters they
# precede, so the greedy left-to-right scan in roman_to_int() consumes
# them first.
_ROMAN_VALS = [
    ("M", 1000), ("CM", 900), ("D", 500), ("CD", 400),
    ("C", 100), ("XC", 90), ("L", 50), ("XL", 40),
    ("X", 10), ("IX", 9), ("V", 5), ("IV", 4), ("I", 1),
]
def roman_to_int(s: str) -> Optional[int]:
    """Greedily parse a Roman numeral.

    Returns the integer value, or None when the string is empty or is not
    fully consumed by the greedy scan over _ROMAN_VALS.
    """
    text = s.upper().strip()
    total, pos = 0, 0
    for token, worth in _ROMAN_VALS:
        # Consume as many copies of this token as appear at the cursor.
        while text.startswith(token, pos):
            total += worth
            pos += len(token)
    if pos == len(text) and total > 0:
        return total
    return None
# Matches a line that is *solely* a Roman numeral (with optional period/trailing space).
# Anchored; leading/trailing whitespace stripped by caller.
# NOTE: every sub-group is optional, so this pattern also matches the empty
# string (and a bare "."); callers must reject blanks themselves —
# split_poems() checks the stripped line is non-empty, and roman_to_int()
# returns None for "".
_ROMAN_RE = re.compile(
    r"^(M{0,4}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3}))\.?$",
    re.IGNORECASE,
)
# ---------------------------------------------------------------------------
# Slug generation
# ---------------------------------------------------------------------------
def slugify(s: str) -> str:
    """Lower-case *s* and reduce it to a hyphen-separated URL slug."""
    slug = re.sub(r"[^\w\s-]", "", s.lower())   # drop punctuation
    slug = re.sub(r"[\s_]+", "-", slug)         # whitespace/underscores -> hyphen
    slug = re.sub(r"-+", "-", slug)             # collapse hyphen runs
    return slug.strip("-")
# ---------------------------------------------------------------------------
# Gutenberg parsing
# ---------------------------------------------------------------------------
# Project Gutenberg boilerplate delimiters. Used with .search(), so the
# "*** START/END OF THE PROJECT GUTENBERG ..." text may appear anywhere in
# the line; matched case-insensitively.
_START_RE = re.compile(r"\*\*\* START OF THE PROJECT GUTENBERG", re.IGNORECASE)
_END_RE = re.compile(r"\*\*\* END OF THE PROJECT GUTENBERG", re.IGNORECASE)
def strip_gutenberg(text: str) -> tuple[str, str]:
    """Return (header, body) where body is the text between the PG markers.

    Missing markers degrade gracefully: no start marker -> empty header,
    body starts at line 0; no end marker -> body runs to end of file.
    """
    lines = text.splitlines()

    def first_match(pattern, fallback: int) -> int:
        # Index of the first line matching *pattern*, else *fallback*.
        for idx, ln in enumerate(lines):
            if pattern.search(ln):
                return idx
        return fallback

    start = first_match(_START_RE, -1) + 1  # line after the START marker, or 0
    end = first_match(_END_RE, len(lines))
    return "\n".join(lines[:start]), "\n".join(lines[start:end])
def parse_gutenberg_meta(header: str) -> dict:
    """Extract Title/Author/Release date lines from the PG header.

    Keys are lower-cased with spaces replaced by hyphens, so both
    "Release date" and "Release Date" land under "release-date".
    """
    wanted = ("Title", "Author", "Release date", "Release Date")
    meta: dict = {}
    for raw in header.splitlines():
        for name in wanted:
            prefix = name + ":"
            if raw.startswith(prefix):
                meta[name.lower().replace(" ", "-")] = raw[len(prefix):].strip()
    return meta
# ---------------------------------------------------------------------------
# Poem splitting
# ---------------------------------------------------------------------------
def split_poems(body: str) -> list[dict]:
    """
    Split body text into individual poems using Roman-numeral headings as
    boundaries. Returns a list of dicts:
    { number: int, roman: str, lines: list[str] }
    Lines are raw — call normalize_stanzas() before writing.
    """
    poems: list[dict] = []
    active: Optional[dict] = None

    def flush() -> None:
        # Keep the finished poem only if it accumulated real text.
        if active is not None and _has_content(active["lines"]):
            poems.append(active)

    for raw in body.splitlines():
        stripped = raw.strip()
        heading = _ROMAN_RE.match(stripped)
        # _ROMAN_RE also matches "", so require a non-blank line.
        if heading and stripped:
            value = roman_to_int(heading.group(1))
            if value is not None:
                flush()
                active = {"number": value, "roman": heading.group(1).upper(), "lines": []}
                continue
        # Everything before the first heading is discarded (active is None).
        if active is not None:
            active["lines"].append(raw)
    flush()
    return poems
def _has_content(lines: list[str], min_words: int = 4) -> bool:
text = " ".join(l.strip() for l in lines if l.strip())
return len(text.split()) >= min_words
# ---------------------------------------------------------------------------
# Stanza normalization
# ---------------------------------------------------------------------------
def normalize_stanzas(raw: list[str]) -> list[str]:
    """
    Strip common indentation, remove leading/trailing blank lines, collapse
    runs of more than one blank line to a single blank line (stanza break).
    """
    def trim(seq: list[str]) -> list[str]:
        # Drop blank lines from both ends.
        lo, hi = 0, len(seq)
        while lo < hi and not seq[lo].strip():
            lo += 1
        while hi > lo and not seq[hi - 1].strip():
            hi -= 1
        return seq[lo:hi]

    lines = trim([entry.rstrip() for entry in raw])

    # Remove the smallest indent shared by all non-blank lines.
    nonblank = [entry for entry in lines if entry.strip()]
    if nonblank:
        margin = min(len(entry) - len(entry.lstrip()) for entry in nonblank)
        lines = [entry[margin:] if len(entry) >= margin else entry for entry in lines]

    # Squeeze consecutive blank lines down to a single stanza break.
    collapsed: list[str] = []
    for entry in lines:
        if not entry.strip() and collapsed and not collapsed[-1].strip():
            continue
        collapsed.append(entry)
    return trim(collapsed)
def first_content_line(lines: list[str]) -> str:
    """Return the first non-blank line of *lines*, stripped; "" if there is none."""
    return next((entry.strip() for entry in lines if entry.strip()), "")
# ---------------------------------------------------------------------------
# YAML helpers
# ---------------------------------------------------------------------------
def yaml_str(s: str) -> str:
    """Quote a string for YAML front matter when a plain scalar is unsafe.

    Beyond the original checks (empty string, leading/trailing whitespace,
    flow/indicator characters), this also quotes values YAML would parse
    as a non-string scalar: block indicators at the start ("-", "?", "%"),
    reserved words ("yes", "no", "null", "true", ...), and pure numbers —
    so a title like "1609" or an abstract starting "- " keeps its string
    type when the front matter is re-parsed. Quoting is always safe, so
    this is strictly more conservative than before.
    """
    def is_number(text: str) -> bool:
        # float() accepts ints, floats, inf/nan — all would be re-typed.
        try:
            float(text)
        except ValueError:
            return False
        return True

    reserved = {"", "~", "null", "true", "false", "yes", "no", "on", "off"}
    needs_quote = (
        s.lower() in reserved
        or s[0] in " \t-?%"
        or s[-1] in " \t"
        or any(c in s for c in ':{}[]|>&*!,#?@`\'"')
        or is_number(s)
    )
    if needs_quote:
        return '"' + s.replace("\\", "\\\\").replace('"', '\\"') + '"'
    return s
# ---------------------------------------------------------------------------
# File generation
# ---------------------------------------------------------------------------
def make_poem_file(
    poem: dict,
    title_prefix: str,
    poet: str,
    collection: str,
    collection_slug: str,
    date: str,
    tags: list[str],
) -> tuple[str, str]:
    """Return (filename_stem, markdown_content) for one poem.

    The content is YAML front matter followed by the normalized stanzas.
    """
    heading = f"{title_prefix} {poem['number']}"
    stem = slugify(heading)
    stanzas = normalize_stanzas(poem["lines"])
    lede = first_content_line(stanzas)
    front = [
        "---",
        f"title: {yaml_str(heading)}",
        f"number: {poem['number']}",
        f"poet: {yaml_str(poet)}",
        f"collection: {yaml_str(collection)}",
        f"collection-url: /poetry/{collection_slug}/",
        f"date: {date}",
        "tags: [" + ", ".join(tags) + "]",
        f"abstract: {yaml_str(lede)}",
        "---",
        "",  # trailing element -> newline after the closing fence
    ]
    return stem, "\n".join(front) + "\n".join(stanzas) + "\n"
def make_collection_index(
    collection: str,
    poet: str,
    date: str,
    tags: list[str],
    collection_slug: str,
    title_prefix: str,
    poems: list[dict],
) -> str:
    """Render the collection index.md: front matter, byline, link list.

    collection_slug is accepted for signature parity with make_poem_file
    but is not used here — poem links are relative ("./...").
    """
    count = len(poems)
    summary = f"{count} poem{'s' if count != 1 else ''}"
    links = []
    for entry in sorted(poems, key=lambda p: p["number"]):
        label = f"{title_prefix} {entry['number']}"
        links.append(f"- [{label}](./{slugify(label)}.html)")
    parts = [
        "---",
        f"title: {yaml_str(collection)}",
        f"poet: {yaml_str(poet)}",
        f"date: {date}",
        "tags: [" + ", ".join(tags) + "]",
        f"abstract: {yaml_str(summary)}",
        "---",
        f"*{poet}* · {date}",
        "\n".join(links),
        "",  # trailing newline
    ]
    return "\n".join(parts)
# ---------------------------------------------------------------------------
# Interactive review
# ---------------------------------------------------------------------------
def interactive_review(poems: list[dict], title_prefix: str) -> list[dict]:
    """Interactively confirm each detected poem before import.

    For each poem, prints a preview (heading, first line, body line count)
    and prompts: Enter = include, "s" = skip this poem, "q" = stop
    reviewing (remaining poems are dropped). Returns the approved subset
    in original order.
    """
    approved: list[dict] = []
    total = len(poems)
    for idx, poem in enumerate(poems, 1):
        title = f"{title_prefix} {poem['number']}"
        preview = first_content_line(normalize_stanzas(poem["lines"]))
        n_lines = sum(1 for l in poem["lines"] if l.strip())
        # BUG FIX: the separator was "'' * 60" — the EMPTY string repeated,
        # which printed nothing (the rule character was evidently lost in
        # transit). Use a visible horizontal rule.
        print(f"\n{'─' * 60}")
        print(f" [{idx}/{total}] {poem['roman']}. → {title}")
        print(f" First line : {preview}")
        print(f" Body lines : {n_lines}")
        print()
        resp = input(" [Enter] include s skip q quit: ").strip().lower()
        if resp == "q":
            print("Stopped at user request.")
            break
        elif resp == "s":
            print(f" Skipped {title}.")
            continue
        approved.append(poem)
    return approved
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> None:
    """CLI entry point: parse arguments, split the source file into poems,
    and write (or dry-run) one markdown file per poem plus an index."""
    parser = argparse.ArgumentParser(
        description="Import a Gutenberg poetry collection into content/poetry/."
    )
    parser.add_argument("source", help="Path to the Gutenberg .txt file")
    parser.add_argument("--poet", required=True, help='e.g. "William Shakespeare"')
    parser.add_argument("--collection", required=True, help='e.g. "Sonnets"')
    parser.add_argument("--date", required=True, help="Publication year, e.g. 1609")
    parser.add_argument("--title-prefix", help='Per-poem title prefix, e.g. "Sonnet". Defaults to singular of --collection.')
    parser.add_argument("--tags", default="poetry", help="Comma-separated tags (default: poetry)")
    parser.add_argument("--slug", help="Override collection directory slug")
    parser.add_argument("--interactive", action="store_true", help="Review each poem before writing")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be written; write nothing")
    parser.add_argument("--overwrite", action="store_true", help="Overwrite existing files")
    args = parser.parse_args()

    source = Path(args.source)
    if not source.exists():
        print(f"error: file not found: {source}", file=sys.stderr)
        sys.exit(1)

    # Defaults.
    # BUG FIX: rstrip("s") removes EVERY trailing "s" ("Grass" -> "Gra");
    # the docstring promises only one trailing 's' is stripped, so use
    # removesuffix (Python 3.9+, already required by the builtin generics
    # used elsewhere in this file).
    title_prefix = args.title_prefix or args.collection.removesuffix("s")
    collection_slug = args.slug or slugify(f"{args.poet}-{args.collection}")
    tags = [t.strip() for t in args.tags.split(",")]
    out_dir = POETRY_DIR / collection_slug

    text = source.read_text(encoding="utf-8", errors="replace")
    header, body = strip_gutenberg(text)
    if not body.strip():
        print("warning: Gutenberg markers not found — treating entire file as body", file=sys.stderr)
        body = text

    poems = split_poems(body)
    if not poems:
        # Dump the raw lines so the user can see why detection failed.
        print("No poems detected. The file may not use Roman-numeral headings.", file=sys.stderr)
        print("First 50 lines of body:", file=sys.stderr)
        for ln in body.splitlines()[:50]:
            print(f" {repr(ln)}", file=sys.stderr)
        sys.exit(1)

    print(f"Detected {len(poems)} poems · collection: {args.collection} · poet: {args.poet}")

    if args.interactive:
        poems = interactive_review(poems, title_prefix)
        print(f"\n{len(poems)} poem(s) approved for import.")
        if not poems:
            print("Nothing to write.")
            return

    # Build the full path -> content map before touching the filesystem.
    files: dict[Path, str] = {}
    for poem in poems:
        slug, content = make_poem_file(
            poem, title_prefix, args.poet, args.collection,
            collection_slug, args.date, tags,
        )
        files[out_dir / f"{slug}.md"] = content
    files[out_dir / "index.md"] = make_collection_index(
        args.collection, args.poet, args.date, tags,
        collection_slug, title_prefix, poems,
    )

    # Dry run: list files, show a sample, write nothing.
    if args.dry_run:
        print(f"\nDry run — {len(files)} file(s) → {out_dir.relative_to(REPO_ROOT)}/")
        for path in sorted(files):
            marker = " (exists)" if path.exists() else ""
            print(f" {path.name}{marker}")
        # BUG FIX: separator was "''*60" (empty string repeated -> nothing);
        # use a visible rule.
        print(f"\nSample — first poem:\n{'─' * 60}")
        first_content = next(v for k, v in files.items() if k.name != "index.md")
        print(first_content[:800])
        return

    # Write, honoring --overwrite.
    out_dir.mkdir(parents=True, exist_ok=True)
    written = skipped = 0
    for path, content in sorted(files.items()):
        if path.exists() and not args.overwrite:
            print(f" skip {path.name}")
            skipped += 1
        else:
            path.write_text(content, encoding="utf-8")
            print(f" write {path.name}")
            written += 1
    print(f"\n{written} written, {skipped} skipped → {out_dir.relative_to(REPO_ROOT)}/")
    # The --dry-run path returned above, so the original "if not
    # args.dry_run" guard here was always true — dropped as redundant.
    print("Next: make clean && make build")


if __name__ == "__main__":
    main()