#!/usr/bin/env python3
|
|
"""
|
|
import-poetry.py — Import a poetry collection from a Project Gutenberg plain-text file.
|
|
|
|
Produces:
|
|
content/poetry/{collection-slug}/index.md Collection index page
|
|
content/poetry/{collection-slug}/{poem-slug}.md One file per poem
|
|
|
|
Usage:
|
|
python tools/import-poetry.py gutenberg.txt \\
|
|
--poet "William Shakespeare" \\
|
|
--collection "Sonnets" \\
|
|
--date 1609 \\
|
|
--title-prefix "Sonnet" \\
|
|
--tags poetry,english \\
|
|
[--slug shakespeare-sonnets] \\
|
|
[--interactive] \\
|
|
[--dry-run] \\
|
|
[--overwrite]
|
|
|
|
The --title-prefix controls per-poem title generation:
|
|
"Sonnet" → "Sonnet 1", "Sonnet 2", ..., slug "sonnet-1", "sonnet-2"
|
|
If omitted, defaults to the singular of --collection (strips trailing 's').
|
|
"""
|
|
|
|
import argparse
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
# Repository root, assuming this script lives one directory below it
# (e.g. tools/import-poetry.py -> repo root).
REPO_ROOT = Path(__file__).parent.parent
# Destination for generated collections: content/poetry/{collection-slug}/.
POETRY_DIR = REPO_ROOT / "content" / "poetry"
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Roman numeral conversion
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Value table for greedy Roman-numeral decoding, ordered largest to smallest
# so roman_to_int() can peel numerals off the front of the string.
# Subtractive pairs (CM, CD, XC, XL, IX, IV) precede the single letters they
# would otherwise shadow.
_ROMAN_VALS = [
    ("M", 1000), ("CM", 900), ("D", 500), ("CD", 400),
    ("C", 100), ("XC", 90), ("L", 50), ("XL", 40),
    ("X", 10), ("IX", 9), ("V", 5), ("IV", 4), ("I", 1),
]
|
|
|
|
def roman_to_int(s: str) -> Optional[int]:
    """Decode a Roman numeral into an int, or return None if *s* is not one.

    Decoding is greedy and deliberately lenient about repetition counts
    (e.g. "IIII" decodes to 4). Leftover characters, an empty string, or a
    zero total all yield None.
    """
    token = s.upper().strip()
    pos = 0
    total = 0
    for numeral, value in [
        ("M", 1000), ("CM", 900), ("D", 500), ("CD", 400),
        ("C", 100), ("XC", 90), ("L", 50), ("XL", 40),
        ("X", 10), ("IX", 9), ("V", 5), ("IV", 4), ("I", 1),
    ]:
        width = len(numeral)
        while token[pos : pos + width] == numeral:
            total += value
            pos += width
    if pos == len(token) and total > 0:
        return total
    return None
|
|
|
|
# Matches a line that is *solely* a Roman numeral (with optional period/trailing space).
# Anchored; leading/trailing whitespace stripped by caller.
# NOTE: every sub-group is optional, so the pattern also matches the empty
# string — callers must reject blank lines themselves (split_poems does).
_ROMAN_RE = re.compile(
    r"^(M{0,4}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3}))\.?$",
    re.IGNORECASE,
)
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Slug generation
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def slugify(s: str) -> str:
    """Lower-case *s* and reduce it to a hyphen-separated URL slug.

    Punctuation is dropped, runs of whitespace/underscores become single
    hyphens, repeated hyphens collapse, and edge hyphens are trimmed.
    """
    slug = re.sub(r"[^\w\s-]", "", s.lower())
    slug = re.sub(r"[\s_]+", "-", slug)
    return re.sub(r"-+", "-", slug).strip("-")
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Gutenberg parsing
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Project Gutenberg boilerplate delimiters, e.g.:
#   *** START OF THE PROJECT GUTENBERG EBOOK ... ***
#   *** END OF THE PROJECT GUTENBERG EBOOK ... ***
_START_RE = re.compile(r"\*\*\* START OF THE PROJECT GUTENBERG", re.IGNORECASE)
_END_RE = re.compile(r"\*\*\* END OF THE PROJECT GUTENBERG", re.IGNORECASE)
|
|
|
|
def strip_gutenberg(text: str) -> tuple[str, str]:
    """Return (header, body) where body is the text between the PG markers.

    If the START marker is missing the header is empty and the body is the
    whole file; if the END marker is missing the body runs to end-of-file.
    """
    lines = text.splitlines()
    start = 0
    end = len(lines)
    for i, line in enumerate(lines):
        if _START_RE.search(line):
            start = i + 1
            break
    # Search for the END marker only *after* the START marker: a stray match
    # earlier in the file would otherwise give end < start and an empty body.
    for i in range(start, len(lines)):
        if _END_RE.search(lines[i]):
            end = i
            break
    header = "\n".join(lines[:start])
    body = "\n".join(lines[start:end])
    return header, body
|
|
|
|
def parse_gutenberg_meta(header: str) -> dict:
    """Extract Title/Author/Release date lines from the PG header.

    Keys are lower-cased with spaces replaced by hyphens, e.g.
    "Release date: ..." becomes meta["release-date"].
    """
    fields = ("Title", "Author", "Release date", "Release Date")
    meta: dict = {}
    for line in header.splitlines():
        for field in fields:
            prefix = field + ":"
            if line.startswith(prefix):
                key = field.lower().replace(" ", "-")
                meta[key] = line[len(prefix):].strip()
    return meta
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Poem splitting
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def split_poems(body: str) -> list[dict]:
    """
    Split body text into individual poems using Roman-numeral headings as
    boundaries. Returns a list of dicts:
        { number: int, roman: str, lines: list[str] }

    Lines are raw — call normalize_stanzas() before writing.
    """
    poems: list[dict] = []
    current: Optional[dict] = None

    def flush() -> None:
        # Keep the open poem only if it carries enough real text.
        if current is not None and _has_content(current["lines"]):
            poems.append(current)

    for raw in body.splitlines():
        heading = raw.strip()
        # _ROMAN_RE would also match "", so only try it on non-blank lines.
        match = _ROMAN_RE.match(heading) if heading else None
        if match:
            value = roman_to_int(match.group(1))
            if value is not None:
                flush()
                current = {"number": value, "roman": match.group(1).upper(), "lines": []}
                continue
        if current is not None:
            current["lines"].append(raw)

    flush()
    return poems
|
|
|
|
def _has_content(lines: list[str], min_words: int = 4) -> bool:
|
|
text = " ".join(l.strip() for l in lines if l.strip())
|
|
return len(text.split()) >= min_words
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Stanza normalization
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def normalize_stanzas(raw: list[str]) -> list[str]:
    """
    Strip common indentation, remove leading/trailing blank lines, collapse
    runs of more than one blank line to a single blank line (stanza break).
    """
    lines = [line.rstrip() for line in raw]

    # Drop blank lines at either end.
    while lines and not lines[0].strip():
        del lines[0]
    while lines and not lines[-1].strip():
        del lines[-1]

    # Remove the smallest indentation shared by all non-blank lines.
    widths = [len(line) - len(line.lstrip()) for line in lines if line.strip()]
    if widths:
        indent = min(widths)
        lines = [line[indent:] if len(line) >= indent else line for line in lines]

    # Squash consecutive blank lines down to a single stanza break.
    out: list[str] = []
    for line in lines:
        if not line.strip() and out and not out[-1].strip():
            continue
        out.append(line)

    # Trim once more in case collapsing left blanks at the edges.
    while out and not out[0].strip():
        del out[0]
    while out and not out[-1].strip():
        del out[-1]

    return out
|
|
|
|
def first_content_line(lines: list[str]) -> str:
    """Return the first non-blank line of *lines*, stripped; "" if none."""
    return next((line.strip() for line in lines if line.strip()), "")
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# YAML helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def yaml_str(s: str) -> str:
    """Quote a string for YAML if it needs it.

    Plain scalars pass through unchanged. Anything empty, whitespace-padded,
    containing a YAML indicator character, starting with "-"/"%", or spelling
    a YAML 1.1 boolean/null keyword is double-quoted with backslash escaping.
    """
    needs_quote = (
        not s
        or s[0] in " \t"
        or s[-1] in " \t"
        # Leading "-" reads as a block-sequence entry, "%" as a directive.
        or s[0] in "-%"
        or any(c in s for c in ':{}[]|>&*!,#?@`\'"')
        # Bare keywords that YAML 1.1 parsers coerce to bool/null.
        or s.lower() in ("true", "false", "yes", "no", "null", "~", "on", "off")
    )
    if needs_quote:
        return '"' + s.replace("\\", "\\\\").replace('"', '\\"') + '"'
    return s
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# File generation
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def make_poem_file(
    poem: dict,
    title_prefix: str,
    poet: str,
    collection: str,
    collection_slug: str,
    date: str,
    tags: list[str],
) -> tuple[str, str]:
    """Return (filename_stem, markdown_content) for one poem.

    The title is "{title_prefix} {number}" and the filename stem is its slug.
    """
    title = f"{title_prefix} {poem['number']}"
    slug = slugify(title)
    norm = normalize_stanzas(poem["lines"])
    abstract = first_content_line(norm)
    # Quote each tag individually so a tag containing YAML punctuation
    # cannot break the flow-sequence syntax.
    tag_yaml = "[" + ", ".join(yaml_str(t) for t in tags) + "]"
    col_url = f"/poetry/{collection_slug}/"

    fm = f"""\
---
title: {yaml_str(title)}
number: {poem['number']}
poet: {yaml_str(poet)}
collection: {yaml_str(collection)}
collection-url: {col_url}
date: {date}
tags: {tag_yaml}
abstract: {yaml_str(abstract)}
---

"""
    content = fm + "\n".join(norm) + "\n"
    return slug, content
|
|
|
|
def make_collection_index(
    collection: str,
    poet: str,
    date: str,
    tags: list[str],
    collection_slug: str,
    title_prefix: str,
    poems: list[dict],
) -> str:
    """Render the collection's index.md: front matter plus a sorted link list."""
    # Quote each tag individually so YAML punctuation inside a tag stays valid.
    tag_yaml = "[" + ", ".join(yaml_str(t) for t in tags) + "]"
    count = len(poems)
    abstract = f"{count} poem{'s' if count != 1 else ''}"

    # One markdown bullet per poem, ordered by poem number.
    poem_links = "\n".join(
        f"- [{title_prefix} {p['number']}](./{slugify(title_prefix + ' ' + str(p['number']))}.html)"
        for p in sorted(poems, key=lambda p: p["number"])
    )

    return f"""\
---
title: {yaml_str(collection)}
poet: {yaml_str(poet)}
date: {date}
tags: {tag_yaml}
abstract: {yaml_str(abstract)}
---

*{poet}* · {date}

{poem_links}
"""
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Interactive review
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def interactive_review(poems: list[dict], title_prefix: str) -> list[dict]:
    """Prompt per poem: Enter keeps it, "s" skips it, "q" stops the review.

    Returns the subset of *poems* the user accepted, in original order.
    """
    kept: list[dict] = []
    total = len(poems)
    for position, poem in enumerate(poems, 1):
        title = f"{title_prefix} {poem['number']}"
        preview = first_content_line(normalize_stanzas(poem["lines"]))
        body_count = len([l for l in poem["lines"] if l.strip()])

        print(f"\n{'─' * 60}")
        print(f" [{position}/{total}] {poem['roman']}. → {title}")
        print(f" First line : {preview}")
        print(f" Body lines : {body_count}")
        print()
        answer = input(" [Enter] include s skip q quit: ").strip().lower()
        if answer == "q":
            print("Stopped at user request.")
            break
        if answer == "s":
            print(f" Skipped {title}.")
            continue
        kept.append(poem)

    return kept
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Main
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def main() -> None:
    """CLI entry point: parse args, split the Gutenberg file, write markdown."""
    parser = argparse.ArgumentParser(
        description="Import a Gutenberg poetry collection into content/poetry/."
    )
    parser.add_argument("source", help="Path to the Gutenberg .txt file")
    parser.add_argument("--poet", required=True, help='e.g. "William Shakespeare"')
    parser.add_argument("--collection", required=True, help='e.g. "Sonnets"')
    parser.add_argument("--date", required=True, help="Publication year, e.g. 1609")
    parser.add_argument("--title-prefix", help='Per-poem title prefix, e.g. "Sonnet". Defaults to singular of --collection.')
    parser.add_argument("--tags", default="poetry", help="Comma-separated tags (default: poetry)")
    parser.add_argument("--slug", help="Override collection directory slug")
    parser.add_argument("--interactive", action="store_true", help="Review each poem before writing")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be written; write nothing")
    parser.add_argument("--overwrite", action="store_true", help="Overwrite existing files")
    args = parser.parse_args()

    source = Path(args.source)
    if not source.exists():
        print(f"error: file not found: {source}", file=sys.stderr)
        sys.exit(1)

    # Defaults. removesuffix (not rstrip) drops exactly ONE trailing "s":
    # rstrip("s") strips every trailing "s" and would mangle e.g. "Class" -> "Cla".
    title_prefix = args.title_prefix or args.collection.removesuffix("s")
    collection_slug = args.slug or slugify(f"{args.poet}-{args.collection}")
    # Filter out empty entries so a trailing comma in --tags doesn't emit "".
    tags = [t.strip() for t in args.tags.split(",") if t.strip()]
    out_dir = POETRY_DIR / collection_slug

    text = source.read_text(encoding="utf-8", errors="replace")
    header, body = strip_gutenberg(text)

    if not body.strip():
        print("warning: Gutenberg markers not found — treating entire file as body", file=sys.stderr)
        body = text

    poems = split_poems(body)

    if not poems:
        # Dump the start of the body (repr'd) to help diagnose heading format.
        print("No poems detected. The file may not use Roman-numeral headings.", file=sys.stderr)
        print("First 50 lines of body:", file=sys.stderr)
        for ln in body.splitlines()[:50]:
            print(f" {repr(ln)}", file=sys.stderr)
        sys.exit(1)

    print(f"Detected {len(poems)} poems · collection: {args.collection} · poet: {args.poet}")

    if args.interactive:
        poems = interactive_review(poems, title_prefix)
        print(f"\n{len(poems)} poem(s) approved for import.")

    if not poems:
        print("Nothing to write.")
        return

    # Build the full path -> content map before touching the filesystem.
    files: dict[Path, str] = {}
    for poem in poems:
        slug, content = make_poem_file(
            poem, title_prefix, args.poet, args.collection,
            collection_slug, args.date, tags,
        )
        files[out_dir / f"{slug}.md"] = content

    files[out_dir / "index.md"] = make_collection_index(
        args.collection, args.poet, args.date, tags,
        collection_slug, title_prefix, poems,
    )

    # Dry run: list target files and preview the first poem, write nothing.
    if args.dry_run:
        print(f"\nDry run — {len(files)} file(s) → {out_dir.relative_to(REPO_ROOT)}/")
        for path in sorted(files):
            marker = " (exists)" if path.exists() else ""
            print(f" {path.name}{marker}")
        print(f"\nSample — first poem:\n{'─'*60}")
        first_content = next(v for k, v in files.items() if k.name != "index.md")
        print(first_content[:800])
        return

    # Write, honoring --overwrite per file.
    out_dir.mkdir(parents=True, exist_ok=True)
    written = skipped = 0
    for path, content in sorted(files.items()):
        if path.exists() and not args.overwrite:
            print(f" skip {path.name}")
            skipped += 1
        else:
            path.write_text(content, encoding="utf-8")
            print(f" write {path.name}")
            written += 1

    print(f"\n{written} written, {skipped} skipped → {out_dir.relative_to(REPO_ROOT)}/")
    # Dry-run mode returned above, so this always runs on a real import.
    print("Next: make clean && make build")
|
|
|
|
# Script entry point — run the importer only when executed directly.
if __name__ == "__main__":
    main()
|