levineuwirth.org/tools/import-poetry.py

402 lines
13 KiB
Python

#!/usr/bin/env python3
"""
import-poetry.py — Import a poetry collection from a Project Gutenberg plain-text file.
Produces:
content/poetry/{collection-slug}/index.md Collection index page
content/poetry/{collection-slug}/{poem-slug}.md One file per poem
Usage:
python tools/import-poetry.py gutenberg.txt \\
--poet "William Shakespeare" \\
--collection "Sonnets" \\
--date 1609 \\
--title-prefix "Sonnet" \\
--tags poetry,english \\
[--slug shakespeare-sonnets] \\
[--interactive] \\
[--dry-run] \\
[--overwrite]
The --title-prefix controls per-poem title generation:
"Sonnet""Sonnet 1", "Sonnet 2", ..., slug "sonnet-1", "sonnet-2"
If omitted, defaults to the singular of --collection (strips trailing 's').
"""
import argparse
import re
import sys
from pathlib import Path
from typing import Optional
# Repo root is resolved from this script's own location (assumed to live one
# directory below the root, e.g. tools/ — see the usage line in the module
# docstring), so the importer works from any current working directory.
REPO_ROOT = Path(__file__).parent.parent
# All generated collections are written under content/poetry/.
POETRY_DIR = REPO_ROOT / "content" / "poetry"
# ---------------------------------------------------------------------------
# Roman numeral conversion
# ---------------------------------------------------------------------------
# (numeral, value) pairs in descending value order, with the subtractive
# forms (CM, CD, XC, XL, IX, IV) listed before the single letters they
# precede, so the greedy left-to-right scan in roman_to_int() consumes
# them first.
_ROMAN_VALS = [
    ("M", 1000), ("CM", 900), ("D", 500), ("CD", 400),
    ("C", 100), ("XC", 90), ("L", 50), ("XL", 40),
    ("X", 10), ("IX", 9), ("V", 5), ("IV", 4), ("I", 1),
]
def roman_to_int(s: str) -> Optional[int]:
    """Greedily parse a Roman numeral.

    Returns the integer value, or None when the string is empty or is not
    fully consumed by the greedy scan over _ROMAN_VALS.
    """
    text = s.upper().strip()
    total, pos = 0, 0
    for token, worth in _ROMAN_VALS:
        # Consume as many copies of this token as appear at the cursor.
        while text.startswith(token, pos):
            total += worth
            pos += len(token)
    if pos == len(text) and total > 0:
        return total
    return None
# Matches a line that is *solely* a Roman numeral (with optional period/trailing space).
# Anchored; leading/trailing whitespace stripped by caller.
# NOTE: every sub-group is optional, so this pattern also matches the empty
# string (and a bare "."); callers must reject blanks themselves —
# split_poems() checks the stripped line is non-empty, and roman_to_int()
# returns None for "".
_ROMAN_RE = re.compile(
    r"^(M{0,4}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3}))\.?$",
    re.IGNORECASE,
)
# ---------------------------------------------------------------------------
# Slug generation
# ---------------------------------------------------------------------------
def slugify(s: str) -> str:
    """Lower-case *s* and reduce it to a hyphen-separated URL slug."""
    slug = re.sub(r"[^\w\s-]", "", s.lower())   # drop punctuation
    slug = re.sub(r"[\s_]+", "-", slug)         # whitespace/underscores -> hyphen
    slug = re.sub(r"-+", "-", slug)             # collapse hyphen runs
    return slug.strip("-")
# ---------------------------------------------------------------------------
# Gutenberg parsing
# ---------------------------------------------------------------------------
# Project Gutenberg boilerplate delimiters. Used with .search(), so the
# "*** START/END OF THE PROJECT GUTENBERG ..." text may appear anywhere in
# the line; matched case-insensitively.
_START_RE = re.compile(r"\*\*\* START OF THE PROJECT GUTENBERG", re.IGNORECASE)
_END_RE = re.compile(r"\*\*\* END OF THE PROJECT GUTENBERG", re.IGNORECASE)
def strip_gutenberg(text: str) -> tuple[str, str]:
    """Return (header, body) where body is the text between the PG markers.

    Missing markers degrade gracefully: no start marker -> empty header,
    body starts at line 0; no end marker -> body runs to end of file.
    """
    lines = text.splitlines()

    def first_match(pattern, fallback: int) -> int:
        # Index of the first line matching *pattern*, else *fallback*.
        for idx, ln in enumerate(lines):
            if pattern.search(ln):
                return idx
        return fallback

    start = first_match(_START_RE, -1) + 1  # line after the START marker, or 0
    end = first_match(_END_RE, len(lines))
    return "\n".join(lines[:start]), "\n".join(lines[start:end])
def parse_gutenberg_meta(header: str) -> dict:
    """Extract Title/Author/Release date lines from the PG header.

    Keys are lower-cased with spaces replaced by hyphens, so both
    "Release date" and "Release Date" land under "release-date".
    """
    wanted = ("Title", "Author", "Release date", "Release Date")
    meta: dict = {}
    for raw in header.splitlines():
        for name in wanted:
            prefix = name + ":"
            if raw.startswith(prefix):
                meta[name.lower().replace(" ", "-")] = raw[len(prefix):].strip()
    return meta
# ---------------------------------------------------------------------------
# Poem splitting
# ---------------------------------------------------------------------------
def split_poems(body: str) -> list[dict]:
    """
    Split body text into individual poems using Roman-numeral headings as
    boundaries. Returns a list of dicts:
    { number: int, roman: str, lines: list[str] }
    Lines are raw — call normalize_stanzas() before writing.
    """
    poems: list[dict] = []
    active: Optional[dict] = None

    def flush() -> None:
        # Keep the finished poem only if it accumulated real text.
        if active is not None and _has_content(active["lines"]):
            poems.append(active)

    for raw in body.splitlines():
        stripped = raw.strip()
        heading = _ROMAN_RE.match(stripped)
        # _ROMAN_RE also matches "", so require a non-blank line.
        if heading and stripped:
            value = roman_to_int(heading.group(1))
            if value is not None:
                flush()
                active = {"number": value, "roman": heading.group(1).upper(), "lines": []}
                continue
        # Everything before the first heading is discarded (active is None).
        if active is not None:
            active["lines"].append(raw)
    flush()
    return poems
def _has_content(lines: list[str], min_words: int = 4) -> bool:
text = " ".join(l.strip() for l in lines if l.strip())
return len(text.split()) >= min_words
# ---------------------------------------------------------------------------
# Stanza normalization
# ---------------------------------------------------------------------------
def normalize_stanzas(raw: list[str]) -> list[str]:
    """
    Strip common indentation, remove leading/trailing blank lines, collapse
    runs of more than one blank line to a single blank line (stanza break).
    """
    def trim(seq: list[str]) -> list[str]:
        # Drop blank lines from both ends.
        lo, hi = 0, len(seq)
        while lo < hi and not seq[lo].strip():
            lo += 1
        while hi > lo and not seq[hi - 1].strip():
            hi -= 1
        return seq[lo:hi]

    lines = trim([entry.rstrip() for entry in raw])

    # Remove the smallest indent shared by all non-blank lines.
    nonblank = [entry for entry in lines if entry.strip()]
    if nonblank:
        margin = min(len(entry) - len(entry.lstrip()) for entry in nonblank)
        lines = [entry[margin:] if len(entry) >= margin else entry for entry in lines]

    # Squeeze consecutive blank lines down to a single stanza break.
    collapsed: list[str] = []
    for entry in lines:
        if not entry.strip() and collapsed and not collapsed[-1].strip():
            continue
        collapsed.append(entry)
    return trim(collapsed)
def first_content_line(lines: list[str]) -> str:
    """Return the first non-blank line of *lines*, stripped; "" if there is none."""
    return next((entry.strip() for entry in lines if entry.strip()), "")
# ---------------------------------------------------------------------------
# YAML helpers
# ---------------------------------------------------------------------------
def yaml_str(s: str) -> str:
    """Quote a string for YAML front matter when a plain scalar is unsafe.

    Beyond the original checks (empty string, leading/trailing whitespace,
    flow/indicator characters), this also quotes values YAML would parse
    as a non-string scalar: block indicators at the start ("-", "?", "%"),
    reserved words ("yes", "no", "null", "true", ...), and pure numbers —
    so a title like "1609" or an abstract starting "- " keeps its string
    type when the front matter is re-parsed. Quoting is always safe, so
    this is strictly more conservative than before.
    """
    def is_number(text: str) -> bool:
        # float() accepts ints, floats, inf/nan — all would be re-typed.
        try:
            float(text)
        except ValueError:
            return False
        return True

    reserved = {"", "~", "null", "true", "false", "yes", "no", "on", "off"}
    needs_quote = (
        s.lower() in reserved
        or s[0] in " \t-?%"
        or s[-1] in " \t"
        or any(c in s for c in ':{}[]|>&*!,#?@`\'"')
        or is_number(s)
    )
    if needs_quote:
        return '"' + s.replace("\\", "\\\\").replace('"', '\\"') + '"'
    return s
# ---------------------------------------------------------------------------
# File generation
# ---------------------------------------------------------------------------
def make_poem_file(
    poem: dict,
    title_prefix: str,
    poet: str,
    collection: str,
    collection_slug: str,
    date: str,
    tags: list[str],
) -> tuple[str, str]:
    """Return (filename_stem, markdown_content) for one poem.

    The content is YAML front matter followed by the normalized stanzas.
    """
    heading = f"{title_prefix} {poem['number']}"
    stem = slugify(heading)
    stanzas = normalize_stanzas(poem["lines"])
    lede = first_content_line(stanzas)
    front = [
        "---",
        f"title: {yaml_str(heading)}",
        f"number: {poem['number']}",
        f"poet: {yaml_str(poet)}",
        f"collection: {yaml_str(collection)}",
        f"collection-url: /poetry/{collection_slug}/",
        f"date: {date}",
        "tags: [" + ", ".join(tags) + "]",
        f"abstract: {yaml_str(lede)}",
        "---",
        "",  # trailing element -> newline after the closing fence
    ]
    return stem, "\n".join(front) + "\n".join(stanzas) + "\n"
def make_collection_index(
    collection: str,
    poet: str,
    date: str,
    tags: list[str],
    collection_slug: str,
    title_prefix: str,
    poems: list[dict],
) -> str:
    """Render the collection index.md: front matter, byline, link list.

    collection_slug is accepted for signature parity with make_poem_file
    but is not used here — poem links are relative ("./...").
    """
    count = len(poems)
    summary = f"{count} poem{'s' if count != 1 else ''}"
    links = []
    for entry in sorted(poems, key=lambda p: p["number"]):
        label = f"{title_prefix} {entry['number']}"
        links.append(f"- [{label}](./{slugify(label)}.html)")
    parts = [
        "---",
        f"title: {yaml_str(collection)}",
        f"poet: {yaml_str(poet)}",
        f"date: {date}",
        "tags: [" + ", ".join(tags) + "]",
        f"abstract: {yaml_str(summary)}",
        "---",
        f"*{poet}* · {date}",
        "\n".join(links),
        "",  # trailing newline
    ]
    return "\n".join(parts)
# ---------------------------------------------------------------------------
# Interactive review
# ---------------------------------------------------------------------------
def interactive_review(poems: list[dict], title_prefix: str) -> list[dict]:
    """Interactively confirm each detected poem before import.

    For each poem, prints a preview (heading, first line, body line count)
    and prompts: Enter = include, "s" = skip this poem, "q" = stop
    reviewing (remaining poems are dropped). Returns the approved subset
    in original order.
    """
    approved: list[dict] = []
    total = len(poems)
    for idx, poem in enumerate(poems, 1):
        title = f"{title_prefix} {poem['number']}"
        preview = first_content_line(normalize_stanzas(poem["lines"]))
        n_lines = sum(1 for l in poem["lines"] if l.strip())
        # BUG FIX: the separator was "'' * 60" — the EMPTY string repeated,
        # which printed nothing (the rule character was evidently lost in
        # transit). Use a visible horizontal rule.
        print(f"\n{'─' * 60}")
        print(f" [{idx}/{total}] {poem['roman']}. → {title}")
        print(f" First line : {preview}")
        print(f" Body lines : {n_lines}")
        print()
        resp = input(" [Enter] include s skip q quit: ").strip().lower()
        if resp == "q":
            print("Stopped at user request.")
            break
        elif resp == "s":
            print(f" Skipped {title}.")
            continue
        approved.append(poem)
    return approved
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> None:
    """CLI entry point: parse arguments, split the source file into poems,
    and write (or dry-run) one markdown file per poem plus an index."""
    parser = argparse.ArgumentParser(
        description="Import a Gutenberg poetry collection into content/poetry/."
    )
    parser.add_argument("source", help="Path to the Gutenberg .txt file")
    parser.add_argument("--poet", required=True, help='e.g. "William Shakespeare"')
    parser.add_argument("--collection", required=True, help='e.g. "Sonnets"')
    parser.add_argument("--date", required=True, help="Publication year, e.g. 1609")
    parser.add_argument("--title-prefix", help='Per-poem title prefix, e.g. "Sonnet". Defaults to singular of --collection.')
    parser.add_argument("--tags", default="poetry", help="Comma-separated tags (default: poetry)")
    parser.add_argument("--slug", help="Override collection directory slug")
    parser.add_argument("--interactive", action="store_true", help="Review each poem before writing")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be written; write nothing")
    parser.add_argument("--overwrite", action="store_true", help="Overwrite existing files")
    args = parser.parse_args()

    source = Path(args.source)
    if not source.exists():
        print(f"error: file not found: {source}", file=sys.stderr)
        sys.exit(1)

    # Defaults.
    # BUG FIX: rstrip("s") removes EVERY trailing "s" ("Grass" -> "Gra");
    # the docstring promises only one trailing 's' is stripped, so use
    # removesuffix (Python 3.9+, already required by the builtin generics
    # used elsewhere in this file).
    title_prefix = args.title_prefix or args.collection.removesuffix("s")
    collection_slug = args.slug or slugify(f"{args.poet}-{args.collection}")
    tags = [t.strip() for t in args.tags.split(",")]
    out_dir = POETRY_DIR / collection_slug

    text = source.read_text(encoding="utf-8", errors="replace")
    header, body = strip_gutenberg(text)
    if not body.strip():
        print("warning: Gutenberg markers not found — treating entire file as body", file=sys.stderr)
        body = text

    poems = split_poems(body)
    if not poems:
        # Dump the raw lines so the user can see why detection failed.
        print("No poems detected. The file may not use Roman-numeral headings.", file=sys.stderr)
        print("First 50 lines of body:", file=sys.stderr)
        for ln in body.splitlines()[:50]:
            print(f" {repr(ln)}", file=sys.stderr)
        sys.exit(1)

    print(f"Detected {len(poems)} poems · collection: {args.collection} · poet: {args.poet}")

    if args.interactive:
        poems = interactive_review(poems, title_prefix)
        print(f"\n{len(poems)} poem(s) approved for import.")
        if not poems:
            print("Nothing to write.")
            return

    # Build the full path -> content map before touching the filesystem.
    files: dict[Path, str] = {}
    for poem in poems:
        slug, content = make_poem_file(
            poem, title_prefix, args.poet, args.collection,
            collection_slug, args.date, tags,
        )
        files[out_dir / f"{slug}.md"] = content
    files[out_dir / "index.md"] = make_collection_index(
        args.collection, args.poet, args.date, tags,
        collection_slug, title_prefix, poems,
    )

    # Dry run: list files, show a sample, write nothing.
    if args.dry_run:
        print(f"\nDry run — {len(files)} file(s) → {out_dir.relative_to(REPO_ROOT)}/")
        for path in sorted(files):
            marker = " (exists)" if path.exists() else ""
            print(f" {path.name}{marker}")
        # BUG FIX: separator was "''*60" (empty string repeated -> nothing);
        # use a visible rule.
        print(f"\nSample — first poem:\n{'─' * 60}")
        first_content = next(v for k, v in files.items() if k.name != "index.md")
        print(first_content[:800])
        return

    # Write, honoring --overwrite.
    out_dir.mkdir(parents=True, exist_ok=True)
    written = skipped = 0
    for path, content in sorted(files.items()):
        if path.exists() and not args.overwrite:
            print(f" skip {path.name}")
            skipped += 1
        else:
            path.write_text(content, encoding="utf-8")
            print(f" write {path.name}")
            written += 1
    print(f"\n{written} written, {skipped} skipped → {out_dir.relative_to(REPO_ROOT)}/")
    # The --dry-run path returned above, so the original "if not
    # args.dry_run" guard here was always true — dropped as redundant.
    print("Next: make clean && make build")


if __name__ == "__main__":
    main()