# eddie-soehnel-portable-iden.../scripts/insights-hub-posts-last-6-months.py

#!/usr/bin/env python3
"""
Publish Insights Hub records from the Personal Organization JSON database.

What it does:
1. Scans source JSON records.
2. Selects records where HubTags contains "External Platform Posts".
3. Copies matching JSON files into the Insights Hub hrecords folder.
4. Copies referenced File/Image assets into the Insights Hub files folder.
5. Builds a Markdown digest for records dated within the last 6 months,
   sorted most recent first.

Run manually weekly:
    python insights-hub-posts-last-6-months.py
"""

from __future__ import annotations

import json
import shutil
from datetime import datetime, date
from pathlib import Path
from typing import Any, Iterable
from urllib.parse import quote


# -----------------------------
# CONFIG
# -----------------------------

# Folder scanned (non-recursively) for source *.json records.
SOURCE_HRECORDS_DIR = Path(
    r"C:\Users\edsoe\My Drive\Personal Organization\JSON_Database\hrecords"
)

# Folder against which "File"/"Image" references in the records are resolved.
SOURCE_FILES_DIR = Path(
    r"C:\Users\edsoe\My Drive\Personal Organization\JSON_Database\files"
)

# Root of the published output; the Markdown digest is written directly here.
TARGET_BASE_DIR = Path(
    r"C:\projects\ES\eddie-soehnel-portable-identity-document-OPEN\data\insights-hub"
)

# Destination for copied JSON records and for copied assets, respectively.
TARGET_HRECORDS_DIR = TARGET_BASE_DIR / "hrecords"
TARGET_FILES_DIR = TARGET_BASE_DIR / "files"

# Change this filename if you prefer a different published Markdown name.
# Image links in the digest are written relative to this file's folder.
OUTPUT_MD_FILE = TARGET_BASE_DIR / "insights-hub-posts-last-6-months.md"

# A record is selected only if its "HubTags" contains this exact string.
REQUIRED_TAG = "External Platform Posts"

# Approximate "last 6 months" as 183 days. This avoids requiring extra packages.
LAST_N_DAYS = 183


# -----------------------------
# HELPERS
# -----------------------------

def parse_record_date(value: Any) -> date | None:
    """
    Parse the "Date" value of an hrecord into a ``date``.

    The explicit formats are tried first, in order: MM/DD/YYYY (used by the
    example record, e.g. 06/12/2026), YYYY-MM-DD, YYYY/MM/DD, MM-DD-YYYY,
    "Month D, YYYY" and "Mon D, YYYY". If none of them match, the text is
    tried as an ISO 8601 timestamp (e.g. "2026-06-12T14:30:00") and only its
    date part is kept, so records that carry a time of day are not dropped.

    Args:
        value: Raw date value from the record. Any type; converted with str().

    Returns:
        The parsed date, or None when the value is empty/falsy or unrecognized.
    """
    if not value:
        return None

    text = str(value).strip()

    formats = (
        "%m/%d/%Y",
        "%Y-%m-%d",
        "%Y/%m/%d",
        "%m-%d-%Y",
        "%B %d, %Y",
        "%b %d, %Y",
    )

    for fmt in formats:
        try:
            return datetime.strptime(text, fmt).date()
        except ValueError:
            continue

    # Fallback for timestamps: strptime with the date-only formats above
    # rejects any trailing time component.
    try:
        return datetime.fromisoformat(text).date()
    except ValueError:
        return None


def normalize_to_list(value: Any) -> list[str]:
    """
    Coerce a JSON field value into a clean list of non-empty strings.

    A list is processed item by item; any other non-None value is treated as
    a single item. Each item is converted with str() and stripped; items that
    are None (JSON null) or that strip to an empty string are dropped.

    Args:
        value: A string, a list, None, or any other scalar.

    Returns:
        The stripped, non-empty strings in their original order.
    """
    if value is None:
        return []

    items = value if isinstance(value, list) else [value]

    cleaned: list[str] = []
    for item in items:
        # A null entry must not turn into the literal text "None".
        if item is None:
            continue
        text = str(item).strip()
        if text:
            cleaned.append(text)

    return cleaned


def has_required_tag(record: dict[str, Any]) -> bool:
    """
    Report whether the record's "HubTags" field includes REQUIRED_TAG.

    The comparison is an exact string match against each normalized tag.
    """
    hub_tags = normalize_to_list(record.get("HubTags"))
    return any(tag == REQUIRED_TAG for tag in hub_tags)


def safe_asset_names(record: dict[str, Any]) -> list[str]:
    """
    Collect the local asset references named by a record.

    Reads the "File" and "Image" fields, in that order. Each may be a single
    string or a list of strings, for example:
    - "File": "filename.pdf"
    - "Image": ["one.jpg", "two.jpg"]

    References starting with http:// or https:// are left out; everything
    else is returned once, in first-seen order.
    """
    url_prefixes = ("http://", "https://")

    local_refs = [
        ref
        for field in ("File", "Image")
        for ref in normalize_to_list(record.get(field))
        if not ref.startswith(url_prefixes)
    ]

    # dict keys keep insertion order, which drops repeats but preserves order.
    return list(dict.fromkeys(local_refs))


def copy_file_if_exists(source: Path, target: Path) -> bool:
    """
    Copy ``source`` to ``target`` (with metadata) when ``source`` is a file.

    Args:
        source: Path of the file to copy.
        target: Destination file path; its parent folder is created on demand.

    Returns:
        True if the file was copied, False if ``source`` is missing or is not
        a regular file (for example a directory).
    """
    # is_file() rather than exists(): a reference that resolves to a folder
    # would make shutil.copy2 raise and abort the whole run.
    if not source.is_file():
        return False

    target.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(source, target)
    return True


def load_json_file(path: Path) -> dict[str, Any] | None:
    """
    Read one JSON file and return it only if its top level is an object.

    The file is decoded as UTF-8 with an optional BOM. Any failure to read or
    parse it is reported on stdout and results in None, as does a top-level
    value that is not a dict.
    """
    try:
        parsed = json.loads(path.read_text(encoding="utf-8-sig"))
    except Exception as exc:
        print(f"SKIP unreadable JSON: {path} | {exc}")
        return None

    if not isinstance(parsed, dict):
        print(f"SKIP non-object JSON: {path}")
        return None

    return parsed


def markdown_escape(text: Any) -> str:
    """
    Turn a value into trimmed text for the digest.

    Deliberately does no character escaping, so links and other Markdown
    inside summaries pass through untouched. None becomes an empty string.
    """
    return "" if text is None else str(text).strip()


def build_image_markdown(record: dict[str, Any]) -> str:
    """
    Build the Markdown image lines for every entry in the record's "Image".

    http:// and https:// references are linked as-is. Any other reference is
    linked as ``files/<basename>``, with the basename percent-encoded so that
    spaces or parentheses in a filename do not break the Markdown link.

    The alt text is the record's "Title", or the image's filename stem when
    the title is empty.

    Returns:
        The image lines separated by blank lines, or "" when there are none.
    """
    # The title is the same for every image, so look it up once.
    title_alt = markdown_escape(record.get("Title"))
    lines = []

    for image_name in normalize_to_list(record.get("Image")):
        if image_name.startswith(("http://", "https://")):
            image_src = image_name
        else:
            # The Markdown file lives in TARGET_BASE_DIR, and copied images live
            # in TARGET_BASE_DIR/files, so this relative path should render correctly.
            # The encoding changes only the link text, not the name on disk.
            image_src = f"files/{quote(Path(image_name).name)}"

        alt = title_alt or Path(image_name).stem
        lines.append(f"![{alt}]({image_src})")

    return "\n\n".join(lines)


def iter_json_files(folder: Path) -> Iterable[Path]:
    """
    Yield the ``*.json`` paths directly inside ``folder``, in sorted order.

    Sorting makes the processing order, the log output, and the order of
    digest entries that tie on date and HubID repeatable between runs,
    instead of depending on filesystem enumeration order.
    """
    yield from sorted(folder.glob("*.json"))


# -----------------------------
# MAIN
# -----------------------------

def _is_recent(record: dict[str, Any], cutoff_ordinal: int) -> bool:
    """Return True when the record has a parsed date on or after the cutoff ordinal."""
    parsed = record.get("_parsed_date")
    return parsed is not None and parsed.toordinal() >= cutoff_ordinal


def _render_digest(records: list[dict[str, Any]], generated_on: date) -> str:
    """Build the Markdown digest text for the given, already-ordered records."""
    md_lines = [
        "# Insights Hub Posts - Last 6 Months",
        "",
        f"Generated: {generated_on.isoformat()}",
        "",
        f"Source tag: `{REQUIRED_TAG}`",
        "",
        "---",
        "",
    ]

    for record in records:
        title = markdown_escape(record.get("Title")) or "Untitled"
        summary = markdown_escape(record.get("Summary"))
        record_date = record.get("_parsed_date")
        # Prefer the normalized ISO date; fall back to the raw text if unparsed.
        date_text = record_date.isoformat() if record_date else markdown_escape(record.get("Date"))

        md_lines += [f"## {title}", ""]
        if date_text:
            md_lines += [f"**Date:** {date_text}", ""]
        if summary:
            md_lines += [summary, ""]

        image_md = build_image_markdown(record)
        if image_md:
            md_lines += [image_md, ""]

        md_lines += ["---", ""]

    return "\n".join(md_lines)


def main() -> None:
    """
    Publish tagged records, their assets, and the recent-posts digest.

    Steps:
    1. Check the source folders. A missing hrecords folder is fatal; a
       missing files folder only produces a warning.
    2. For every source record carrying REQUIRED_TAG, copy its JSON file and
       each locally referenced File/Image asset into the target folders.
    3. Write a Markdown digest of the tagged records dated within the last
       LAST_N_DAYS days, most recent first.
    4. Print a summary of what was matched, copied, and missing.

    Raises:
        FileNotFoundError: If SOURCE_HRECORDS_DIR does not exist.
    """
    # Validate inputs before touching the target tree, so a failed run does
    # not leave empty output folders behind.
    if not SOURCE_HRECORDS_DIR.exists():
        raise FileNotFoundError(f"Source hrecords folder not found: {SOURCE_HRECORDS_DIR}")

    if not SOURCE_FILES_DIR.exists():
        print(f"WARNING: Source files folder not found: {SOURCE_FILES_DIR}")
        print("JSON files will still be copied, but image/file assets cannot be copied.")

    TARGET_HRECORDS_DIR.mkdir(parents=True, exist_ok=True)
    TARGET_FILES_DIR.mkdir(parents=True, exist_ok=True)

    today = date.today()
    cutoff = today.toordinal() - LAST_N_DAYS

    matched_records: list[dict[str, Any]] = []
    copied_json_count = 0
    copied_asset_count = 0
    missing_asset_count = 0

    for json_path in iter_json_files(SOURCE_HRECORDS_DIR):
        record = load_json_file(json_path)
        if not record:
            continue

        if not has_required_tag(record):
            continue

        # Bookkeeping keys live only in memory; the JSON file is copied as-is.
        record["_source_json_path"] = str(json_path)
        record["_json_filename"] = json_path.name
        record["_parsed_date"] = parse_record_date(record.get("Date"))

        matched_records.append(record)

        # Copy JSON record.
        if copy_file_if_exists(json_path, TARGET_HRECORDS_DIR / json_path.name):
            copied_json_count += 1

        # Copy referenced files/images. Assets land flat in the target files
        # folder under their basename, matching the links in the digest.
        for asset_name in safe_asset_names(record):
            source_asset = SOURCE_FILES_DIR / asset_name
            target_asset = TARGET_FILES_DIR / Path(asset_name).name

            if copy_file_if_exists(source_asset, target_asset):
                copied_asset_count += 1
            else:
                missing_asset_count += 1
                print(f"MISSING asset referenced by {json_path.name}: {source_asset}")

    # Last 6 months only for the Markdown digest; undated records are left out.
    recent_records = [r for r in matched_records if _is_recent(r, cutoff)]

    # Most recent first; HubID breaks ties between records sharing a date.
    recent_records.sort(
        key=lambda r: (
            r.get("_parsed_date") or date.min,
            str(r.get("HubID", "")),
        ),
        reverse=True,
    )

    OUTPUT_MD_FILE.parent.mkdir(parents=True, exist_ok=True)
    OUTPUT_MD_FILE.write_text(_render_digest(recent_records, today), encoding="utf-8")

    print("DONE")
    print(f"Matched records: {len(matched_records)}")
    print(f"Records in 6-month Markdown: {len(recent_records)}")
    print(f"Copied JSON files: {copied_json_count}")
    print(f"Copied assets: {copied_asset_count}")
    print(f"Missing assets: {missing_asset_count}")
    print(f"Markdown output: {OUTPUT_MD_FILE}")


# Run the publish job only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()