diff --git a/.example.env b/.example.env new file mode 100644 index 0000000..103f79c --- /dev/null +++ b/.example.env @@ -0,0 +1,6 @@ +# From Litellm +GOOGLE_API_KEY="your_actual_google_api_key_here" +MODEL=gemini/gemini-2.0-flash + +# additional +TEMPERATURE=0.2 diff --git a/.gitignore b/.gitignore index 78048ea..70f81ef 100644 --- a/.gitignore +++ b/.gitignore @@ -208,4 +208,7 @@ __marimo__/ **/*pb2*.py -department_report.md \ No newline at end of file +department_report.md +*.exe +*.bin +gen_* diff --git a/.vscode/extensions.json b/.vscode/extensions.json new file mode 100644 index 0000000..7478fbd --- /dev/null +++ b/.vscode/extensions.json @@ -0,0 +1,6 @@ +{ + "recommendations": [ + "ms-python.python", + "ms-python.vscode-pylance" + ] +} \ No newline at end of file diff --git a/README.md b/README.md index 8b4692c..b6dea76 100644 --- a/README.md +++ b/README.md @@ -4,11 +4,7 @@ CLI tool to consolidate individual weekly member reports (Word / Markdown) into ## Features -* Parse member report files named like: `{member_name}工作報告-YYYYMMDD.docx|doc|md`. -* Convert `.docx` to Markdown using `mammoth`. -* Optional examples folder with both historical member reports and an existing team report (file name contains `之工作報告`). These are packaged into few-shot examples. -* Builds a structured prompt and calls a configured model (default `gemini/gemini-1.5-flash`, changeable) through `litellm`. -* Outputs a consolidated Markdown report with standardized sections. +check [use_cases.md](docs/use_cases.md) for details. ## Environment @@ -25,15 +21,23 @@ You can also use: `GEMINI_API_KEY`, `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `AZUR ## Usage ```bash -gen-report --source ./this_week -e ./history1 -e ./history2 --out dept_report_20240821.md \ +uv run python -m gen_report -h +uv run python -m gen_report --source ./this_week --examples ./history1 -e ./history2 --out dept_report_20240821.md \ --model gemini/gemini-1.5-flash ``` Dry run (print prompt only, no LLM call): ```bash -gen-report --source ./this_week -e ./history1 -e ./history2 --dry-run -gen-report --source ./this_week -e ./history1 --dry-run +uv run python -m gen_report -s ./this_week -e ./history1 -e ./history2 --dry-run +uv run python -m gen_report -s ./this_week -e ./history1 --dry-run +``` + +### build(not supported yet) + +```bash +uv run nuitka --onefile --assume-yes-for-downloads src/gen_report # take a long time +./gen_report.exe -h ``` ## Folder Layout Expectations @@ -57,5 +61,5 @@ history2/ Run locally without install: ```bash -python -m gen_report --source ./this_week --dry-run +uv run python -m gen_report -s ./this_week --dry-run ``` diff --git a/pyproject.toml b/pyproject.toml index ea4d837..77c29b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,3 +19,9 @@ gen-report = "gen_report:main" [build-system] requires = ["uv_build>=0.8.12,<0.9.0"] build-backend = "uv_build" + +[dependency-groups] +dev = [ + "nuitka>=2.7.13", + "ruff>=0.12.11", +] diff --git a/src/gen_report/__init__.py b/src/gen_report/__init__.py index b3f98cb..e69de29 100644 --- a/src/gen_report/__init__.py +++ b/src/gen_report/__init__.py @@ -1,289 +0,0 @@ -from __future__ import annotations - -import argparse -import asyncio -import os -import traceback -import re -from dataclasses import dataclass -from pathlib import Path -from typing import Iterable, List, Optional, Any, cast, Union, Dict -from dotenv import load_dotenv - -from litellm import acompletion - -import mammoth # type: ignore - - -MEMBER_REPORT_REGEX = re.compile(r"^(?P.+?)工作報告-(?P\d{8})\.(?Pdocx|doc|md)$") -TEAM_REPORT_HINT = "之工作報告" - - -@dataclass -class ExampleWeek: - date: str - member_reports: List[str] - team_report: str - - -def convert_docx_to_markdown(path: Path) -> str: - """Convert a .docx file to markdown text. - - If mammoth is unavailable or conversion fails, returns an empty string. - """ - if mammoth is None: - return "" - try: - with path.open("rb") as f: - result = mammoth.convert_to_markdown(f) - return result.value.strip() - except Exception: - return "" - - -def load_markdown_from_file(path: Path) -> str: - ext = path.suffix.lower() - if ext == ".md": - try: - return path.read_text(encoding="utf-8").strip() - except Exception: - return "" - if ext in {".docx", ".doc"}: - return convert_docx_to_markdown(path) - return "" - - -def iter_member_reports(folder: Path) -> Iterable[tuple[str, str, Path]]: - for p in folder.glob("*"): - if not p.is_file(): - continue - m = MEMBER_REPORT_REGEX.match(p.name) - if not m: - continue - name = m.group("name") - date = m.group("date") - yield name, date, p - - -def collect_examples(examples_dir: Optional[Path]) -> List[ExampleWeek]: - if not examples_dir or not examples_dir.exists(): - return [] - # Group by date extracted from member reports; then look for a team report file containing hint & date range. - by_date: dict[str, List[str]] = {} - for name, date, path in iter_member_reports(examples_dir): - text = load_markdown_from_file(path) - if not text: - continue - by_date.setdefault(date, []).append(f"# {name}\n\n{text}") - - weeks: List[ExampleWeek] = [] - # Find potential team reports - team_candidates = [ - p for p in examples_dir.glob("*") if p.is_file() and TEAM_REPORT_HINT in p.name - ] - for date, member_chunks in by_date.items(): - team_text = "" - for cand in team_candidates: - # Heuristic: if date substring appears in file name - if date in cand.name: - team_text = load_markdown_from_file(cand) - break - if not team_text and team_candidates: - # fallback first candidate - team_text = load_markdown_from_file(team_candidates[0]) - if team_text: - weeks.append( - ExampleWeek(date=date, member_reports=member_chunks, team_report=team_text) - ) - return weeks - - -def collect_examples_from_dirs(example_dirs: List[Path]) -> List[ExampleWeek]: - """Collect and merge example weeks from multiple directories. - - If the same date appears across directories, member reports are concatenated - and the first non-empty team report encountered is used. - """ - if not example_dirs: - return [] - merged: Dict[str, ExampleWeek] = {} - for d in example_dirs: - for w in collect_examples(d): - if w.date not in merged: - merged[w.date] = ExampleWeek( - date=w.date, - member_reports=list(w.member_reports), - team_report=w.team_report, - ) - else: - existing = merged[w.date] - # Append member reports, avoid exact duplicates - seen = set(existing.member_reports) - for mr in w.member_reports: - if mr not in seen: - existing.member_reports.append(mr) - seen.add(mr) - if not existing.team_report and w.team_report: - existing.team_report = w.team_report - # Return as list - return list(merged.values()) - - -def build_few_shot_examples(weeks: List[ExampleWeek]) -> str: - if not weeks: - return "" - # Use most recent dates (sorted descending) - weeks_sorted = sorted(weeks, key=lambda w: w.date, reverse=True) - blocks = [] - for w in weeks_sorted: - blocks.append( - f"\n\n{chr(10).join(w.member_reports)}\n\n\n{w.team_report}\n\n" - ) - return "\n\n".join(blocks) - - -def build_prompt(member_markdowns: List[str], examples_block: str) -> str: - intro = ( - "You are an assistant that aggregates individual weekly engineering reports into a concise, well-structured department report. " - "Summarize achievements, ongoing work, issues/risks, metrics, and next week plan. Keep factual, merge duplicates, and preserve important numbers.\n" - ) - instructions = ( - "Format sections based on projects' name." - "Plz try to keep projects consistent between examples and new report." - "'BONY觸控一體機','得鑫螺絲','得鑫螺絲HMI','EBONY觸控一體機'... 都屬於'螺絲案'的一部份。" - "'Resymot', 'Resymot GUI', 'iMotion-XYZ控制器'... 都屬於'iMotion-3dof'的一部份。" - "'育成計畫', '新人訓練'... 都屬於'教育訓練'的一部份。" - "Use bullet points; group similar items." - ) - input_block = "\n\n".join( - f"\n{txt}\n" for i, txt in enumerate(member_markdowns, 1) - ) - prompt = ( - f"{intro}{instructions}\n" - + (f"\nFEW-SHOT EXAMPLES:\n{examples_block}\n" if examples_block else "") - + f"\nTARGET INPUT:\n{input_block}\n\nGenerate the consolidated department weekly report in markdown now." - ) - return prompt - - -async def call_llm(model: str, prompt: str, max_tokens: int = 2000) -> Union[str, None]: - try: - resp: Any = await acompletion( - model=model, - messages=[{"role": "user", "content": prompt}], - max_tokens=max_tokens, - ) - # Support both object-style and dict-style responses - choices: Any = getattr(resp, "choices", None) - if choices is None and isinstance(resp, dict): - choices = resp.get("choices") - if not choices: - return None - first = choices[0] - message: Any = getattr(first, "message", None) - if message is None and isinstance(first, dict): - message = first.get("message") - if not message: - return None - content: Optional[str] = getattr(message, "content", None) - if content is None and isinstance(message, dict): - content = cast(Optional[str], message.get("content")) - return content - except Exception: - print(f"error occurred: {traceback.format_exc()}") - return None - - -def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace: - p = argparse.ArgumentParser( - description="Generate consolidated weekly department report from individual reports" - ) - p.add_argument( - "--source", required=True, help="Folder containing this week's member reports (docx/md)" - ) - p.add_argument( - "--examples", - "-e", - action="append", - default=[], - help="Folder containing historical example weeks (optional). May be provided multiple times.", - ) - p.add_argument( - "--model", - default="gemini/gemini-1.5-flash", - help="LLM model name for litellm (default: gemini/gemini-1.5-flash)", - ) - p.add_argument("--out", default="department_report.md", help="Output markdown file path") - p.add_argument("--max-tokens", type=int, default=2000, help="Max tokens for generation") - p.add_argument( - "--dry-run", action="store_true", help="Only build and print prompt (no LLM call)" - ) - return p.parse_args(argv) - - -def gather_member_markdowns(source_dir: Path) -> List[str]: - chunks: List[str] = [] - for name, date, path in iter_member_reports(source_dir): - text = load_markdown_from_file(path) - if not text: - continue - chunks.append(f"# {name}\n\n{text}") - return chunks - - -def ensure_api_key_present() -> None: - # litellm supports many providers; we only check a few common env vars. - if any( - os.getenv(k) - for k in [ - "GOOGLE_API_KEY", # Gemini - "GEMINI_API_KEY", # alternate naming if user sets - "OPENAI_API_KEY", - "ANTHROPIC_API_KEY", - "AZURE_OPENAI_API_KEY", - "GROQ_API_KEY", - ] - ): - return - raise RuntimeError( - "No provider API key env var found (e.g., GOOGLE_API_KEY for Gemini). Set one before running." - ) - - -def main(argv: Optional[List[str]] = None) -> None: - load_dotenv() - args = parse_args(argv) - source_dir = Path(args.source) - if not source_dir.exists(): - raise SystemExit(f"Source folder not found: {source_dir}") - examples_dirs = [Path(p) for p in (args.examples or [])] - - member_markdowns = gather_member_markdowns(source_dir) - if not member_markdowns: - raise SystemExit("No valid member reports found in source folder.") - examples = collect_examples_from_dirs(examples_dirs) - examples_block = build_few_shot_examples(examples) - prompt = build_prompt(member_markdowns, examples_block) - - if args.dry_run: - print(prompt) - return - - ensure_api_key_present() - - async def _run(): - report_md = await call_llm(args.model, prompt, max_tokens=args.max_tokens) - out_path = Path(args.out) - if report_md: - out_path.write_text(report_md, encoding="utf-8") - print(f"Report written to {out_path}") - - asyncio.run(_run()) - - -__all__ = [ - "main", - "build_prompt", - "collect_examples", - "gather_member_markdowns", -] diff --git a/src/gen_report/__main__.py b/src/gen_report/__main__.py new file mode 100644 index 0000000..3d401ef --- /dev/null +++ b/src/gen_report/__main__.py @@ -0,0 +1,300 @@ +from __future__ import annotations + +import argparse +import asyncio +import os +import traceback +import re +from dataclasses import dataclass +from pathlib import Path +from typing import Iterable, List, Optional, Any, cast, Union, Dict +from dotenv import load_dotenv + +from litellm import acompletion + +import mammoth # type: ignore + + +MEMBER_REPORT_REGEX = re.compile(r"^(?P.+?)工作報告-(?P\d{8})\.(?Pdocx|doc|md)$") +TEAM_REPORT_HINT = "之工作報告" + + +@dataclass +class ExampleWeek: + date: str + member_reports: List[str] + team_report: str + + +def convert_docx_to_markdown(path: Path) -> str: + """Convert a .docx file to markdown text. + + If mammoth is unavailable or conversion fails, returns an empty string. + """ + if mammoth is None: + return "" + try: + with path.open("rb") as f: + result = mammoth.convert_to_markdown(f) + return result.value.strip() + except Exception: + return "" + + +def load_markdown_from_file(path: Path) -> str: + ext = path.suffix.lower() + if ext == ".md": + try: + return path.read_text(encoding="utf-8").strip() + except Exception: + return "" + if ext in {".docx", ".doc"}: + return convert_docx_to_markdown(path) + return "" + + +def iter_member_reports(folder: Path) -> Iterable[tuple[str, str, Path]]: + for p in folder.glob("*"): + if not p.is_file(): + continue + m = MEMBER_REPORT_REGEX.match(p.name) + if not m: + continue + name = m.group("name") + date = m.group("date") + yield name, date, p + + +def collect_examples(examples_dir: Optional[Path]) -> List[ExampleWeek]: + if not examples_dir or not examples_dir.exists(): + return [] + # Group by date extracted from member reports; then look for a team report file containing hint & date range. + by_date: dict[str, List[str]] = {} + for name, date, path in iter_member_reports(examples_dir): + text = load_markdown_from_file(path) + if not text: + continue + by_date.setdefault(date, []).append(f"# {name}\n\n{text}") + + weeks: List[ExampleWeek] = [] + # Find potential team reports + team_candidates = [ + p for p in examples_dir.glob("*") if p.is_file() and TEAM_REPORT_HINT in p.name + ] + for date, member_chunks in by_date.items(): + team_text = "" + for cand in team_candidates: + # Heuristic: if date substring appears in file name + if date in cand.name: + team_text = load_markdown_from_file(cand) + break + if not team_text and team_candidates: + # fallback first candidate + team_text = load_markdown_from_file(team_candidates[0]) + if team_text: + weeks.append( + ExampleWeek(date=date, member_reports=member_chunks, team_report=team_text) + ) + return weeks + + +def collect_examples_from_dirs(example_dirs: List[Path]) -> List[ExampleWeek]: + """Collect and merge example weeks from multiple directories. + + If the same date appears across directories, member reports are concatenated + and the first non-empty team report encountered is used. + """ + if not example_dirs: + return [] + merged: Dict[str, ExampleWeek] = {} + for d in example_dirs: + for w in collect_examples(d): + if w.date not in merged: + merged[w.date] = ExampleWeek( + date=w.date, + member_reports=list(w.member_reports), + team_report=w.team_report, + ) + else: + existing = merged[w.date] + # Append member reports, avoid exact duplicates + seen = set(existing.member_reports) + for mr in w.member_reports: + if mr not in seen: + existing.member_reports.append(mr) + seen.add(mr) + if not existing.team_report and w.team_report: + existing.team_report = w.team_report + # Return as list + return list(merged.values()) + + +def build_few_shot_examples(weeks: List[ExampleWeek]) -> str: + if not weeks: + return "" + # Use most recent dates (sorted descending) + weeks_sorted = sorted(weeks, key=lambda w: w.date, reverse=True) + blocks = [] + for w in weeks_sorted: + blocks.append( + f"\n\n{chr(10).join(w.member_reports)}\n\n\n{w.team_report}\n\n" + ) + return "\n\n".join(blocks) + + +def build_prompt(member_markdowns: List[str], examples_block: str) -> str: + intro = ( + "You are an assistant that aggregates individual weekly engineering reports into a concise, well-structured department report. " + "Summarize achievements, ongoing work, issues/risks, metrics, and next week plan. Keep factual, merge duplicates, and preserve important numbers.\n" + ) + instructions = ( + "Format sections based on projects' name." + "Plz try to keep projects consistent between examples and new report." + "'BONY觸控一體機','得鑫螺絲','得鑫螺絲HMI','EBONY觸控一體機'... 都屬於'螺絲案'的一部份。" + "'Resymot', 'Resymot GUI', 'iMotion-XYZ控制器'... 都屬於'iMotion-3dof'的一部份。" + "'育成計畫', '新人訓練'... 都屬於'教育訓練'的一部份。" + "Use bullet points; group similar items." + ) + input_block = "\n\n".join( + f"\n{txt}\n" for i, txt in enumerate(member_markdowns, 1) + ) + prompt = ( + f"{intro}{instructions}\n" + + (f"\nFEW-SHOT EXAMPLES:\n{examples_block}\n" if examples_block else "") + + f"\nTARGET INPUT:\n{input_block}\n\nGenerate the consolidated department weekly report in markdown now." + ) + return prompt + + +async def call_llm(model: str, prompt: str, max_tokens: int = 2000) -> Union[str, None]: + try: + # Read temperature from environment variable TEMPERATURE (0.0 - 1.0). + # Fall back to 0.5 if unset or invalid. + def _parse_temperature() -> float: + val = os.getenv("TEMPERATURE") + if val is None: + return 0.5 + try: + t = float(val) + except Exception: + return 0.5 + # Clamp to valid range + if t != t: # NaN guard + return 0.5 + return max(0.0, min(1.0, t)) + + temperature = _parse_temperature() + + resp: Any = await acompletion( + temperature=temperature, + model=model, + messages=[{"role": "user", "content": prompt}], + max_tokens=max_tokens, + ) + # Support both object-style and dict-style responses + choices: Any = getattr(resp, "choices", None) + if choices is None and isinstance(resp, dict): + choices = resp.get("choices") + if not choices: + return None + first = choices[0] + message: Any = getattr(first, "message", None) + if message is None and isinstance(first, dict): + message = first.get("message") + if not message: + return None + content: Optional[str] = getattr(message, "content", None) + if content is None and isinstance(message, dict): + content = cast(Optional[str], message.get("content")) + return content + except Exception: + print(f"error occurred: {traceback.format_exc()}") + return None + + +def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace: + p = argparse.ArgumentParser( + description="Generate consolidated weekly department report from individual reports" + ) + p.add_argument( + "--source", + "-s", + required=True, + help="Folder containing this week's member reports (docx/md)", + ) + p.add_argument( + "--examples", + "-e", + action="append", + default=[], + help="Folder containing historical example weeks (optional). May be provided multiple times.", + ) + p.add_argument( + "--model", + "-m", + default="gemini/gemini-1.5-flash", + help="LLM model name for litellm (default: gemini/gemini-1.5-flash)", + ) + p.add_argument("--out", "-o", default="department_report.md", help="Output markdown file path") + p.add_argument("--max-tokens", type=int, default=2000, help="Max tokens for generation") + p.add_argument( + "--dry-run", action="store_true", help="Only build and print prompt (no LLM call)" + ) + return p.parse_args(argv) + + +def gather_member_markdowns(source_dir: Path) -> List[str]: + chunks: List[str] = [] + for name, date, path in iter_member_reports(source_dir): + text = load_markdown_from_file(path) + if not text: + continue + chunks.append(f"# {name}\n\n{text}") + return chunks + + +def ensure_api_key_present() -> None: + # litellm supports many providers; we only check a few common env vars. + if any( + os.getenv(k) + for k in [ + "GOOGLE_API_KEY", # Gemini + "GEMINI_API_KEY", # alternate naming if user sets + "OPENAI_API_KEY", + "ANTHROPIC_API_KEY", + "AZURE_OPENAI_API_KEY", + "GROQ_API_KEY", + ] + ): + return + raise RuntimeError( + "No provider API key env var found (e.g., GOOGLE_API_KEY for Gemini). Set one before running." + ) + + +if __name__ == "__main__": + load_dotenv() + args = parse_args() + source_dir = Path(args.source) + if not source_dir.exists(): + raise SystemExit(f"Source folder not found: {source_dir}") + examples_dirs = [Path(p) for p in (args.examples or [])] + + member_markdowns = gather_member_markdowns(source_dir) + if not member_markdowns: + raise SystemExit("No valid member reports found in source folder.") + examples = collect_examples_from_dirs(examples_dirs) + examples_block = build_few_shot_examples(examples) + prompt = build_prompt(member_markdowns, examples_block) + + if args.dry_run: + print(prompt) + + async def _run(): + report_md = await call_llm(args.model, prompt, max_tokens=args.max_tokens) + out_path = Path(args.out) + if report_md: + out_path.write_text(report_md, encoding="utf-8") + print(f"Report written to {out_path}") + + asyncio.run(_run())