#!/usr/bin/env python3
# Copyright (C) 2020-2026 by Haim Bar and HaiYing Wang
# Part of the runcode package 2026/06/13 v2.6
# https://github.com/Ossifragus/runcode
#
# This file may be distributed and/or modified under the conditions of
# the LaTeX Project Public License, either version 1.3c of this license
# or (at your option) any later version.
# The latest version of this license is in http://www.latex-project.org/lppl.txt

"""consolidate.py — produce a self-contained LaTeX project from a runcode document.

After a successful build (all code executed, outputs cached in generated/),
this script:
  1. Copies the project to a standalone output directory.
  2. Transforms all .tex files: replaces runcode output commands with their
     cached content so the result compiles without talk2stat or any language
     runtime.
  3. Replaces \\usepackage{runcode} with a minimal shim (tcolorbox + listings).
  4. Optionally compiles the result.

Supported commands (all language variants — R, Python, Julia, MatLab):
  \\runR / \\runPython / ...        removed (output shown via \\includeOutput)
  \\runRIncOut / ...                replaced with cached content
  \\runRChunk / ...                 replaced with cached content
  \\includeOutput{label}[type]      replaced with cached content
  \\inlnR{code}[label][type] / ... replaced with cached content
  \\inln{cmd}{code}[label][type]    replaced with cached content
  \\showCode / \\showChunk          kept; minimal shim uses lstinputlisting

Auto-numbered outputs (no explicit label) are left as-is; review them manually.

Motivated by the Overleaf use case (Overleaf cannot run talk2stat) but the
output is general: any recipient can compile it with plain pdflatex.

Usage:
  python3 consolidate.py [OPTIONS] MAIN.tex

Options:
  --out DIR        output directory (default: standalone)
  --engine CMD     LaTeX engine for compilation (default: pdflatex)
  --no-compile     copy and transform only; skip compilation
  --exclude GLOB   additional glob pattern to exclude (repeatable)
"""

import argparse
import re
import shutil
import subprocess
import sys
from fnmatch import fnmatch
from pathlib import Path


# ── Default copy excludes ──────────────────────────────────────────────────────

# Basename glob patterns that are never copied into the standalone project.
DEFAULT_EXCLUDES = frozenset((
    # LaTeX / BibTeX / index build artefacts
    "*.aux",
    "*.log",
    "*.out",
    "*.toc",
    "*.bbl",
    "*.blg",
    "*.idx",
    "*.ilg",
    "*.ind",
    "*.mw",
    "*.xdv",
    "*.tbc",
    # latexmk / SyncTeX bookkeeping
    "*.fls",
    "*.fdb_latexmk",
    "*.synctex.gz",
    # server / debugging leftovers
    "nohup.out",
    "serverPID*.txt",
    "*debug.txt",
    "talk2stat.log",
    # cache bookkeeping files
    "*.stale",
    "*.md5",
))


# ── Shim inserted in place of \usepackage[...]{runcode} ───────────────────────

# Raw LaTeX preamble text that patch_usepackage() substitutes for the
# \usepackage[...]{runcode} line.  It loads tcolorbox, listings and xparse,
# defines \showCode / \showChunk in terms of \lstinputlisting, and declares
# every other runcode command with an empty body so that any occurrence left
# in the document still compiles.  The xparse argument specs below mirror the
# signatures used in CATALOG ('m' = mandatory, 'O{default}' = optional).
RUNCODE_SHIM = r"""\usepackage{tcolorbox}
\tcbuselibrary{breakable,skins}
\usepackage{listings}
\usepackage{xparse}
% consolidate.py shim: replaces \usepackage{runcode} for standalone compilation
% Display commands (functional via listings):
\NewDocumentCommand{\showCode}{m m O{} O{}}{\lstinputlisting{#2}}
\NewDocumentCommand{\showChunk}{m m m O{} O{}}{\lstinputlisting{generated/#2-#3.txt}}
% Execution commands (no-ops: outputs already inlined by consolidate.py):
\NewDocumentCommand{\runExtCode}{m m m O{}}{}
\NewDocumentCommand{\runCodeIncOut}{m m O{} O{} O{vbox}}{}
\NewDocumentCommand{\runR}{O{} m m O{}}{}
\NewDocumentCommand{\runRIncOut}{O{} m O{} O{} O{vbox}}{}
\NewDocumentCommand{\runRChunk}{O{} m m O{} O{} O{vbox}}{}
\NewDocumentCommand{\runPython}{O{} m m O{}}{}
\NewDocumentCommand{\runPythonIncOut}{O{} m O{} O{} O{vbox}}{}
\NewDocumentCommand{\runPythonChunk}{O{} m m O{} O{} O{vbox}}{}
\NewDocumentCommand{\runJulia}{O{} m m O{}}{}
\NewDocumentCommand{\runJuliaIncOut}{O{} m O{} O{} O{vbox}}{}
\NewDocumentCommand{\runJuliaChunk}{O{} m m O{} O{} O{vbox}}{}
\NewDocumentCommand{\runMatLab}{O{} m m O{}}{}
\NewDocumentCommand{\runMatLabIncOut}{O{} m O{} O{} O{vbox}}{}
\NewDocumentCommand{\runMatLabChunk}{O{} m m O{} O{} O{vbox}}{}
\NewDocumentCommand{\includeOutput}{m O{vbox}}{}
\NewDocumentCommand{\inln}{m m O{} O{inline}}{}
\NewDocumentCommand{\inlnR}{O{} m O{} O{inline}}{}
\NewDocumentCommand{\inlnPython}{O{} m O{} O{inline}}{}
\NewDocumentCommand{\inlnJulia}{O{} m O{} O{inline}}{}
\NewDocumentCommand{\inlnMatLab}{O{} m O{} O{inline}}{}"""


# ── Argument parser helpers ────────────────────────────────────────────────────

def find_closing_brace(text: str, pos: int) -> int:
    """Return the index of the ``}`` that closes the ``{`` at ``text[pos]``.

    Nested ``{...}`` groups are balanced.  A backslash and the character that
    follows it are skipped as a unit, so escaped braces (``\\{``, ``\\}``) and
    ``\\\\`` do not affect the nesting depth — this matches how TeX pairs
    braces.

    Returns -1 when no matching closing brace exists.
    """
    depth = 0
    i = pos
    n = len(text)
    while i < n:
        ch = text[i]
        if ch == "\\":
            # Control symbol / start of a control word: the next character can
            # never open or close a group.
            i += 2
            continue
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return i
        i += 1
    return -1


def read_optional_arg(text: str, pos: int) -> tuple:
    """Consume an optional ``[arg]`` at ``text[pos:]``.

    Leading spaces, tabs and a single line break are skipped.  A blank line
    (two or more newlines in the skipped whitespace) is a paragraph break in
    TeX and ends the argument list, so a ``[`` that follows one is *not*
    treated as an argument.

    The closing ``]`` is the first one that is
      * outside any ``{...}`` group,
      * not part of a nested ``[...]`` pair, and
      * not escaped with a backslash (``\\]``).

    Returns ``(content, new_pos)`` on success, where ``new_pos`` is the index
    just past the closing ``]``; returns ``(None, pos)`` when no optional
    argument is present or it is never closed.
    """
    n = len(text)
    p = pos
    while p < n and text[p] in " \t\n":
        p += 1
    if p >= n or text[p] != "[":
        return None, pos
    if text.count("\n", pos, p) >= 2:
        # Blank line between the command and '[': paragraph break, no argument.
        return None, pos

    brace_depth = 0
    bracket_depth = 0
    i = p + 1
    while i < n:
        ch = text[i]
        if ch == "\\":
            # Skip the escaped character (\], \[, \{, \}, \\ ...).
            i += 2
            continue
        if ch == "{":
            brace_depth += 1
        elif ch == "}":
            # Never go negative: a stray '}' is ignored, as before.
            brace_depth = max(0, brace_depth - 1)
        elif brace_depth == 0:
            if ch == "[":
                bracket_depth += 1
            elif ch == "]":
                if bracket_depth == 0:
                    return text[p + 1:i], i + 1
                bracket_depth -= 1
        i += 1
    return None, pos


def read_mandatory_arg(text: str, pos: int) -> tuple:
    """Consume a mandatory ``{arg}`` at ``text[pos:]``.

    Spaces, tabs and newlines before the opening brace are skipped.  Returns
    ``(content, new_pos)`` with ``new_pos`` just past the matching ``}``, or
    ``(None, pos)`` when no brace group starts there or it is never closed.
    """
    not_found = (None, pos)
    size = len(text)

    start = pos
    while start < size and text[start] in " \t\n":
        start += 1

    if start >= size:
        return not_found
    if text[start] != "{":
        return not_found

    close = find_closing_brace(text, start)
    if close == -1:
        return not_found
    return text[start + 1:close], close + 1


def read_args(text: str, pos: int, sig: str) -> tuple:
    """Parse a run of arguments starting at ``text[pos:]``.

    ``sig`` is a string of argument kinds: ``'m'`` for a mandatory ``{...}``
    group, anything else (``'O'``) for an optional ``[...]`` group.

    Returns ``(args, new_pos)`` where ``args`` has one entry per character of
    ``sig``; an absent optional argument is recorded as ``None``.  If any
    mandatory argument is missing the result is ``([], pos)``.
    """
    collected = []
    cursor = pos
    for kind in sig:
        if kind != "m":
            # Optional: record None when absent and leave the cursor alone.
            value, after = read_optional_arg(text, cursor)
            collected.append(value)
            if value is not None:
                cursor = after
            continue

        value, after = read_mandatory_arg(text, cursor)
        if value is None:
            return [], pos
        collected.append(value)
        cursor = after
    return collected, cursor


# ── Command catalog ────────────────────────────────────────────────────────────
#
# Each entry: (commands, signature, handler, label_index)
#
# handler:
#   "drop"   — remove the entire command (run-only, no display)
#   "inline" — replace with cached content; label at label_index in args list
#   "chunk"  — like "inline", but a missing label defaults to "<src>-<chunk>"
#
# Entry order does not affect matching: try_replace() rejects a candidate when
# the character after the command name is a letter or "@", so a short name
# (e.g. \inln) can never match the start of a longer one (e.g. \inlnR).

CATALOG = [
    # IncOut: run + display — replace with cached output
    # \runRIncOut[cmd]{src}[displayopts][label][type]  sig: O m O O O
    # label_idx=3; None → auto-numbered (left as-is)
    ({"\\runRIncOut", "\\runPythonIncOut", "\\runJuliaIncOut", "\\runMatLabIncOut"},
     "OmOOO", "inline", 3),

    # Chunk: run chunk + display — replace with cached output
    # \runRChunk[cmd]{src}{chunk}[displayopts][label][type]  sig: O m m O O O
    # handler="chunk": label derived as src-chunk when args[4] is absent
    ({"\\runRChunk", "\\runPythonChunk", "\\runJuliaChunk", "\\runMatLabChunk"},
     "OmmOOO", "chunk", 4),

    # runCodeIncOut: {cmd}{src}[displayopts][label][type]  sig: m m O O O
    ({"\\runCodeIncOut"},
     "mmOOO", "inline", 3),

    # includeOutput: {label}[type]  sig: m O
    ({"\\includeOutput"},
     "mO", "inline", 0),

    # inln (base): {cmd}{code}[label][type]  sig: m m O O
    ({"\\inln"},
     "mmOO", "inline", 2),

    # inlnLANG: [cmd]{code}[label][type]  sig: O m O O
    ({"\\inlnR", "\\inlnPython", "\\inlnJulia", "\\inlnMatLab"},
     "OmOO", "inline", 2),

    # Run-only: just drop (output shown separately via \includeOutput)
    # \runR[cmd]{src}{label}[opts]  sig: O m m O
    ({"\\runR", "\\runPython", "\\runJulia", "\\runMatLab"},
     "OmmO", "drop", -1),

    # Run-only, external program: three mandatory arguments followed by one
    # optional one (sig: m m m O), matching the shim's
    # \NewDocumentCommand{\runExtCode}{m m m O{}}.  Parsing it with the
    # language-specific "OmmO" signature would consume only two brace groups
    # and leave the third behind as literal text in the document.
    ({"\\runExtCode"},
     "mmmO", "drop", -1),
]


# ── Cache reader ───────────────────────────────────────────────────────────────

def read_cached(generated_dir: Path, label: str) -> str | None:
    """Return content of generated/label.tex, or None if missing, '' if empty."""
    path = generated_dir / (label + ".tex")
    if not path.exists():
        return None
    if path.stat().st_size <= 1:
        return ""
    return path.read_text(encoding="utf-8", errors="replace")


def wrap_vbox(content: str) -> str:
    """Wrap ``content`` in a verbatim block inside a tcolorbox.

    Trailing newlines of ``content`` are removed so that exactly one line
    break separates the text from ``\\end{verbatim}``.
    """
    body = content.rstrip("\n")
    pieces = [
        "\\begin{tcolorbox}",
        "\\begin{verbatim}",
        body,
        "\\end{verbatim}",
        "\\end{tcolorbox}",
    ]
    return "\n".join(pieces)


def render_cached(generated_dir: Path, label: str, display_type: str | None, original: str) -> str:
    """Build the text that replaces a command whose output is cached as ``label``.

    * no cache file            → ``original`` (the command is kept untouched)
    * blank cache              → empty string (the command disappears)
    * type starts with "vbox"  → cached text wrapped by ``wrap_vbox``
    * type is "tex"            → cached text inserted unchanged
    * any other type           → cached text with surrounding whitespace removed

    A missing or empty ``display_type`` is treated as "vbox".
    """
    cached = read_cached(generated_dir, label)
    if cached is None:
        return original
    if cached.strip() == "":
        return ""

    kind = display_type if display_type else "vbox"
    kind = kind.strip()

    if kind == "tex":
        return cached
    if kind.startswith("vbox"):
        return wrap_vbox(cached)
    # Everything else is rendered inline.
    return cached.strip()


# ── Single-command transformer ─────────────────────────────────────────────────

def try_replace(text: str, i: int, generated_dir: Path) -> tuple:
    """Try to match and replace a runcode command starting at ``text[i]``.

    Every command in CATALOG is tried.  A candidate is accepted only when the
    character after its name is neither a letter nor ``@`` (so ``\\runR`` does
    not match ``\\runRChunk``) and its arguments parse per the signature.

    Returns ``(replacement, new_i)`` where ``new_i`` is the index just past the
    command's last argument, or ``(None, i)`` when nothing matches or the
    command has no explicit label (auto-numbered; left for manual review).

    Handlers:
      * "drop"   — the replacement is the empty string.
      * "chunk"  — label is args[4] if given, else ``"<src>-<chunk>"``.
      * "inline" — label is ``args[label_idx]``; the display type is the
        argument right after the label.

    When no display type is written, the ``\\inln*`` commands default to
    "inline" (as their ``O{inline}`` declaration in the shim states); all
    other commands fall through to render_cached's "vbox" default.
    """
    if text[i] != "\\":
        return None, i

    for cmd_set, sig, handler, label_idx in CATALOG:
        for cmd in cmd_set:
            if not text.startswith(cmd, i):
                continue
            name_end = i + len(cmd)
            # Ensure we matched the full command name (not a prefix of a longer one)
            nxt = text[name_end:name_end + 1]
            if nxt.isalpha() or nxt == "@":
                continue

            args, new_i = read_args(text, name_end, sig)
            if not args and sig:
                continue          # parse failed; try next command

            if handler == "drop":
                return "", new_i

            original = text[i:new_i]

            if handler == "chunk":
                # Label is explicit (args[4]) or derived as src-chunk (args[1]-args[2])
                explicit = args[4] if len(args) > 4 else None
                label = explicit or f"{args[1]}-{args[2]}"
                dtype = args[5] if len(args) > 5 else None
                return render_cached(generated_dir, label, dtype, original), new_i

            # handler == "inline"
            label = args[label_idx] if label_idx < len(args) else None
            if not label:
                return None, i    # auto-numbered: leave as-is

            # display type is the arg after the label, if present
            dtype_idx = label_idx + 1
            dtype = args[dtype_idx] if dtype_idx < len(args) else None
            if dtype is None and cmd.startswith("\\inln"):
                # \inln / \inlnLANG declare their type argument as O{inline};
                # without this they would be boxed like block output.
                dtype = "inline"

            return render_cached(generated_dir, label, dtype, original), new_i

    return None, i


# ── File transformer ───────────────────────────────────────────────────────────

def transform_tex(text: str, generated_dir: Path) -> str:
    """Replace runcode commands in ``text`` with their cached content.

    The text is scanned left to right:
      * an unescaped ``%`` starts a comment, which is copied verbatim up to
        and including the end of the line (no substitution inside comments);
      * a backslash followed by a letter starts a control word and is offered
        to ``try_replace``;
      * a backslash followed by anything else is a control symbol (``\\%``,
        ``\\\\``, ``\\{`` ...) and is copied together with that character.

    Consuming control symbols as a pair is what makes ``%`` detection exact:
    in ``\\%`` the percent sign is escaped, while in ``\\\\%`` (line break
    followed by a comment) it is not.
    """
    pieces: list[str] = []
    i = 0
    n = len(text)

    while i < n:
        ch = text[i]

        if ch == "%":
            # Escaped percent signs never reach this branch (see below), so
            # this is a real comment: copy it through the end of the line.
            eol = text.find("\n", i)
            if eol == -1:
                pieces.append(text[i:])
                break
            pieces.append(text[i:eol + 1])
            i = eol + 1
            continue

        if ch != "\\":
            pieces.append(ch)
            i += 1
            continue

        # Only control words (backslash + letter) can be runcode commands.
        if text[i + 1:i + 2].isalpha():
            repl, new_i = try_replace(text, i, generated_dir)
            if repl is not None:
                pieces.append(repl)
                i = new_i
                continue

        # Not replaced: copy the backslash and the character it governs.
        pieces.append(text[i:i + 2])
        i += 2

    return "".join(pieces)


# \usepackage with an optional [options] group and a {comma,separated,list}.
_USEPACKAGE_RE = re.compile(r"\\usepackage\s*(\[[^\]]*\])?\s*\{([^}]*)\}")


def _is_runcode(name: str) -> bool:
    """True for ``runcode`` itself or a path whose last component is ``runcode``."""
    return name.rsplit("/", 1)[-1] == "runcode"


def _split_comment(line: str) -> tuple:
    """Split ``line`` into ``(code, comment)`` at the first unescaped ``%``."""
    i = 0
    while i < len(line):
        ch = line[i]
        if ch == "\\":
            i += 2            # \% (or any control symbol) is not a comment start
            continue
        if ch == "%":
            return line[:i], line[i:]
        i += 1
    return line, ""


def _replace_runcode(match: "re.Match") -> str:
    """Return the replacement for one ``\\usepackage`` occurrence."""
    names = [name.strip() for name in match.group(2).split(",")]
    if not any(_is_runcode(name) for name in names):
        return match.group(0)                 # unrelated \usepackage: keep
    others = [name for name in names if name and not _is_runcode(name)]
    if not others:
        return RUNCODE_SHIM
    # Other packages shared the list: keep loading them (with the original
    # options), then add the shim.
    options = match.group(1) or ""
    return f"\\usepackage{options}{{{','.join(others)}}}\n{RUNCODE_SHIM}"


def patch_usepackage(text: str) -> str:
    """Replace the ``\\usepackage`` that loads runcode with the shim.

    Handled forms: ``\\usepackage{runcode}``, ``\\usepackage[opts]{runcode}``,
    a path-qualified name such as ``{../runcode}``, and lists in any order
    (``{foo,runcode}``, ``{runcode,foo}``) — the other packages in the list
    are kept.  Packages whose name merely ends in "runcode" are not touched.

    Only the code part of each line is patched; anything after an unescaped
    ``%`` is a comment and is preserved as written.
    """
    patched = []
    for line in text.split("\n"):
        code, comment = _split_comment(line)
        patched.append(_USEPACKAGE_RE.sub(_replace_runcode, code) + comment)
    return "\n".join(patched)


# ── Project copy ───────────────────────────────────────────────────────────────

def excluded(path: Path, project_dir: Path, out_name: str, extra: list) -> bool:
    """Decide whether ``path`` must be left out of the standalone copy.

    A path is excluded when
      * its first component below ``project_dir`` is ``out_name`` or ``.git``, or
      * any component of its relative path (a parent directory name or the
        file name itself) matches one of the glob patterns in ``extra`` or
        ``DEFAULT_EXCLUDES``.

    Matching every component — not just the basename — means that excluding
    a directory also excludes everything below it.
    """
    rel = path.relative_to(project_dir)
    parts = rel.parts
    if parts and parts[0] in {out_name, ".git"}:
        return True

    def matches_any(patterns) -> bool:
        return any(fnmatch(part, pat) for part in parts for pat in patterns)

    return matches_any(extra) or matches_any(DEFAULT_EXCLUDES)


def copy_project(project_dir: Path, out_dir: Path, extra_excludes: list) -> None:
    """Copy the project tree into a freshly created ``out_dir``.

    An existing ``out_dir`` is deleted first.  Because of that, the function
    refuses (exits with an error message) when ``out_dir`` is the project
    directory itself or one of its ancestors — deleting it would destroy the
    sources that are about to be copied.

    Everything for which ``excluded()`` is true is skipped, and so is
    ``out_dir`` itself when it lives somewhere inside the project.  File
    metadata is preserved (``shutil.copy2``).
    """
    project_res = project_dir.resolve()
    out_res = out_dir.resolve()
    if out_res == project_res or out_res in project_res.parents:
        sys.exit(
            f"ERROR: output directory {out_dir} contains the project; "
            "refusing to delete it"
        )

    if out_dir.exists():
        shutil.rmtree(out_dir)
    # parents=True: allow nested targets such as build/standalone
    out_dir.mkdir(parents=True)

    # Location of the output directory relative to the project, if inside it.
    try:
        out_parts = out_res.relative_to(project_res).parts
    except ValueError:
        out_parts = None

    for src in sorted(project_dir.rglob("*")):
        rel = src.relative_to(project_dir)
        if out_parts is not None and rel.parts[:len(out_parts)] == out_parts:
            continue            # never copy the output directory into itself
        if excluded(src, project_dir, out_dir.name, extra_excludes):
            continue
        dst = out_dir / rel
        if src.is_dir():
            dst.mkdir(parents=True, exist_ok=True)
        else:
            dst.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(src, dst)


# ── Tex-file transformer ───────────────────────────────────────────────────────

def transform_all_tex(out_dir: Path, generated_dir: Path) -> None:
    """Patch every ``*.tex`` file below ``out_dir`` in place.

    Each file gets the runcode ``\\usepackage`` replaced by the shim and its
    runcode commands replaced by cached output from ``generated_dir``.  A
    file is rewritten only when its text actually changed; one status line
    per file is printed either way.
    """
    for tex_file in sorted(out_dir.rglob("*.tex")):
        before = tex_file.read_text(encoding="utf-8", errors="replace")
        after = transform_tex(patch_usepackage(before), generated_dir)

        if after == before:
            print(f"  unchanged:   {tex_file.relative_to(out_dir)}")
            continue

        tex_file.write_text(after, encoding="utf-8")
        print(f"  transformed: {tex_file.relative_to(out_dir)}")


# ── Compilation ────────────────────────────────────────────────────────────────

def compile_pdf(out_dir: Path, main: str, engine: str) -> bool:
    """Run ``engine`` twice on ``<main>.tex`` inside ``out_dir``.

    The engine's exit status is not inspected (nonstopmode can finish with
    recoverable errors); a pass counts as successful when a non-empty
    ``<main>.pdf`` exists afterwards.  On failure the last 50 lines of the
    log are printed, if there is one.

    Returns True when both passes produced a PDF, False otherwise — including
    when the engine cannot be started at all (e.g. it is not installed).
    """
    cmd = [engine, "-shell-escape", "-interaction=nonstopmode", f"{main}.tex"]
    pdf = out_dir / f"{main}.pdf"
    log = out_dir / f"{main}.log"

    for pass_n in range(1, 3):
        print(f"  pass {pass_n}: {' '.join(cmd)}")
        try:
            subprocess.run(cmd, cwd=out_dir)
        except OSError as exc:
            # Engine missing / not executable: report it instead of crashing.
            print(f"  ERROR: cannot run {engine!r}: {exc}")
            return False

        if not (pdf.exists() and pdf.stat().st_size > 0):
            print(f"  ERROR: no PDF after pass {pass_n}")
            if log.exists():
                lines = log.read_text(errors="replace").splitlines()
                print("\n".join(lines[-50:]))
            return False

    print(f"  OK — {pdf} ({pdf.stat().st_size // 1024} KB)")
    return True


# ── Main ───────────────────────────────────────────────────────────────────────

def main() -> None:
    """Command-line entry point: copy, transform and optionally compile.

    Exits with an error message when MAIN.tex or the ``generated/`` cache
    directory next to it is missing, or when compilation fails.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("main", metavar="MAIN.tex")
    parser.add_argument("--out", default="standalone", metavar="DIR")
    parser.add_argument("--engine", default="pdflatex", metavar="CMD")
    parser.add_argument("--no-compile", action="store_true")
    parser.add_argument("--exclude", action="append", default=[], metavar="GLOB")
    opts = parser.parse_args()

    # Locate the document and everything derived from its directory.
    main_tex = Path(opts.main).resolve()
    if not main_tex.exists():
        sys.exit(f"ERROR: {opts.main} not found")

    project_dir = main_tex.parent
    generated_dir = project_dir / "generated"
    if not generated_dir.exists():
        sys.exit("ERROR: generated/ not found — run a full build first")

    out_dir = (project_dir / opts.out).resolve()

    print(f"── Step 1: copy project → {out_dir.name}/ ─────────────────────────────────")
    copy_project(project_dir, out_dir, opts.exclude)

    print("\n── Step 2: inline cached outputs ──────────────────────────────────────────")
    transform_all_tex(out_dir, generated_dir)

    wants_pdf = not opts.no_compile
    if wants_pdf:
        print(f"\n── Step 3: compile {main_tex.stem}.tex ─────────────────────────────────────")
        built = compile_pdf(out_dir, main_tex.stem, opts.engine)
        if not built:
            sys.exit("Compilation failed.")

    print(f"\nDone. Standalone project in: {out_dir}/")
    print("Review any auto-numbered \\inln*/\\includeOutput commands manually.")

# Run the command-line interface only when executed as a script, so that the
# module can be imported (e.g. for testing) without side effects.
if __name__ == "__main__":
    main()
