#!/usr/bin/env python3
"""Attach Russian search tags to cached events and build a tag navigation map."""

from __future__ import annotations

import json
import re
import time
from collections import Counter
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Iterable, List, Sequence

from deep_translator import GoogleTranslator

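# Tokens excluded from tagging, grouped by why they are noise: common
# English function/filler words, Chinese geography and venue terms,
# generic trade-fair vocabulary (plus payload field names), and
# recurring event acronyms and brand codes.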
BASE_STOPWORDS = {
    "and",
    "the",
    "for",
    "with",
    "from",
    "that",
    "this",
    "will",
    "are",
    "was",
    "were",
    "his",
    "her",
    "has",
    "have",
    "had",
    "but",
    "all",
    "any",
    "per",
    "not",
    "one",
    "two",
    "three",
    "can",
    "more",
    "most",
    "each",
    "our",
    "their",
    "your",
    "also",
    "its",
    "who",
    "she",
    "him",
    "them",
    "you",
    "they",
    "within",
    "onto",
    "into",
    "over",
    "such",
    "than",
    "out",
    "via",
    "off",
    "make",
    "made",
    "being",
    "around",
    "some",
    "very",
    "may",
    "only",
    "among",
    "upon",
    "others",
    "both",
    "between",
    "every",
    "same",
    "well",
    "like",
    "aims",
    "help",
    "along",
    "makes",
    "give",
    "been",
    "after",
    "before",
    "across",
    "through",
    "much",
    "able",
    "just",
    "use",
    "used",
    "offering",
    "offers",
    "bring",
    "brings",
    "provide",
    "provides",
    "providing",
    "including",
    "includes",
    "include",
    "featuring",
    "feature",
    "features",
    "where",
    "while",
    "again",
    "other",
    "many",
    "take",
    "takes",
    "taking",
    "under",
    "overseas",
    "new",
    "latest",
    "as",
    "is",
    "an",
    "it",
    "on",
    "to",
    "of",
    "in",
    "by",
    "at",
    "be",
    "go",
    "non",
    "top",
    "since",
    "about",
    "held",
    "plus",
    "etc",
    "per",
    "meet",
    "meetings",
    "covering",
    "cover",
    "coverings",
    "together",
    "various",
    "variety",
    "range",
    "ranges",
    "complete",
    "entire",
    "whole",
    "line",
    "lines",
    "site",
    "sites",
    "online",
    "detail",
    "details",
    "official",
    "organizer",
    "edition",
    "annual",
    "cycle",
    "largest",
    "biggest",
    "opening",
    "expected",
    "important",
    "future",
    "enter",
    "entering",
    "enterprises",
    "exhibitors",
    "exhibits",
    "gathering",
    "gatherings",
    "things",
    "bringing",
    "bright",
    "functional",
    "focuses",
    "opportunities",
    "worldwide",
    "point",
    "promote",
    "related",
    "applications",
    "masters",
    "grandeur",
    "experts",
    "advanced",
    "high-end",
    "high-quality",
    "cutting-edge",
    "one-stop",
}

GEO_STOPWORDS = {
    "asia",
    "asian",
    "china",
    "chinese",
    "shanghai",
    "guangzhou",
    "beijing",
    "shenzhen",
    "chengdu",
    "wenzhou",
    "nanning",
    "nanjing",
    "xiamen",
    "yiwu",
    "qingdao",
    "tianjin",
    "foshan",
    "chongqing",
    "hangzhou",
    "dongguan",
    "changsha",
    "ningbo",
    "sanya",
    "zhuhai",
    "shenyang",
    "wuhan",
    "kunming",
    "nanchang",
    "ganzhou",
    "hainan",
    "guangxi",
    "guangdong",
    "henan",
    "hubei",
    "anhui",
    "fujian",
    "jiangsu",
    "jiangxi",
    "liaoning",
    "sichuan",
    "xinjiang",
    "zhejiang",
    "harbin",
    "taipei",
    "pazhou",
    "pudong",
    "chaoyang",
    "haizhu",
    "longyang",
    "yuejiang",
    "xingang",
    "poly",
    "district",
    "center",
    "centre",
    "hall",
    "city",
    "road",
    "avenue",
    "avenues",
    "street",
    "streets",
    "park",
    "parks",
    "complex",
    "floor",
    "west",
    "east",
    "north",
    "south",
    "middle",
    "central",
    "zone",
    "zones",
    "downtown",
    "area",
    "areas",
    "town",
    "region",
    "regions",
    "pacific",
}

GENERIC_STOPWORDS = {
    "industrial",
    "industry",
    "industries",
    "world",
    "global",
    "international",
    "expo",
    "exhibition",
    "exhibitions",
    "fair",
    "fairs",
    "show",
    "shows",
    "showcase",
    "showcases",
    "event",
    "events",
    "trade",
    "platform",
    "leading",
    "conference",
    "conferences",
    "forum",
    "summit",
    "series",
    "professional",
    "professionals",
    "dedicated",
    "comprehensive",
    "innovative",
    "innovation",
    "innovations",
    "premier",
    "brand",
    "brands",
    "buyers",
    "supplier",
    "suppliers",
    "manufacturer",
    "manufacturers",
    "product",
    "products",
    "solutions",
    "solution",
    "services",
    "service",
    "businesses",
    "market",
    "markets",
    "sector",
    "sectors",
    "focused",
    "focusing",
    "focus",
    "special",
    "specialized",
    "extensive",
    "extensively",
    "session",
    "sessions",
    "spring",
    "autumn",
    "summer",
    "winter",
    "season",
    "seasons",
    "year",
    "years",
    "day",
    "days",
    "week",
    "weeks",
    "month",
    "months",
    "source",
    "slug",
    "label",
    "status",
    "completed",
    "detail",
    "detail_url",
    "official_site",
    "event_email",
    "date",
    "date_range",
    "generated",
    "hosted",
    "hosting",
    "resources",
    "resource",
    "supply",
    "home",
    "chain",
    "sourcing",
    "development",
    "supply-chain",
}

EVENT_BRANDS = {
    "drt",
    "gwpf",
    "gwpe",
    "hotelex",
    "cphi",
    "fai",
    "mba",
    "hde",
    "pes",
    "tct",
    "b2b",
    "mbs",
    "mg",
    "expoe",
}

STOPWORDS = BASE_STOPWORDS | GEO_STOPWORDS | GENERIC_STOPWORDS | EVENT_BRANDS

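# Tokenization and tagging knobs:
#   TOKEN_RE          candidate tokens: two or more latin letters/digits/&+-
#   TOKEN_SPLIT_RE    splits compound tokens ("food&beverage") into parts
#   WORD_CHUNK_RE     pulls word chunks (latin, Cyrillic, digits) from translations
#   SLUG_SANITIZE_RE  collapses non-alphanumeric runs into hyphens for tag slugs
#   MIN_FREQUENCY     corpus-wide count a token needs to become a shared tag
#   THROTTLE_SECONDS  pause after each translation request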
TOKEN_RE = re.compile(r"[A-Za-z0-9&+-]{2,}")
TOKEN_SPLIT_RE = re.compile(r"[+&/|\-]+")
WORD_CHUNK_RE = re.compile(r"[A-Za-z\u0400-\u04FF0-9]+")
MIN_FREQUENCY = 5
MAX_TAGS_PER_EVENT = 5
FALLBACK_TAG_COUNT = 3
DEFAULT_FALLBACK_TERM = "misc"
THROTTLE_SECONDS = 0.05
SLUG_SANITIZE_RE = re.compile(r"[^a-z0-9]+")


@dataclass
class MonthPayload:
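    """One month JSON file from cache/months plus its parsed payload.

    The events property returns the payload's event list, installing an
    empty list in the payload when the field is missing or not a list.
    """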
    path: Path
    data: Dict[str, object]

    @property
    def slug(self) -> str:
        value = self.data.get("slug")
        return str(value) if isinstance(value, str) and value else self.path.stem

    @property
    def label(self) -> str:
        value = self.data.get("label")
        return str(value) if isinstance(value, str) and value else self.slug

    @property
    def events(self) -> List[Dict[str, object]]:
        events = self.data.get("events")
        if isinstance(events, list):
            return events
        events_list: List[Dict[str, object]] = []
        self.data["events"] = events_list
        return events_list


@dataclass
class EventRecord:
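    """Ties one tokenized event to its month payload, index, and position."""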
    month: MonthPayload
    event_index: int
    position: int
    tokens: List[str]

    @property
    def event(self) -> Dict[str, object]:
        return self.month.events[self.event_index]


def iter_month_files(month_dir: Path) -> Iterable[Path]:
    for path in sorted(month_dir.glob("*.json")):
        if path.is_file():
            yield path


def iter_event_text(event: Dict[str, object]) -> Iterable[str]:
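    """Yield the free-text fields (name, description) used for tokenization."""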
    for field in ("name", "description"):
        value = event.get(field)
        if isinstance(value, str):
            yield value


def tokenize_event(event: Dict[str, object]) -> List[str]:
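    """Extract lowercase keyword tokens from an event's name and description.

    Stopwords and tokens without letters are dropped; compound tokens such
    as "food&beverage" are split and each part is filtered again.
    """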
    tokens: List[str] = []
    for text in iter_event_text(event):
        for raw_token in TOKEN_RE.findall(text):
            token = raw_token.lower()
            if token in STOPWORDS:
                continue
            if not any(ch.isalpha() for ch in token):
                continue
            for part in split_compound_token(token):
                if part in STOPWORDS:
                    continue
                if not any(ch.isalpha() for ch in part):
                    continue
                tokens.append(part)
    return tokens


def split_compound_token(token: str) -> List[str]:
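    """Split a token on +, &, /, | and -, falling back to the whole token."""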
    parts = [part for part in TOKEN_SPLIT_RE.split(token) if part]
    return parts or [token]


def contains_cyrillic(text: str) -> bool:
    for char in text:
        if "\u0400" <= char <= "\u04FF":
            return True
    return False


def simplify_label(raw_label: str, fallback_term: str) -> str:
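    """Reduce a raw translation to one word, preferring a Cyrillic chunk.

    Returns the first Cyrillic word chunk if present, otherwise the first
    word chunk, otherwise the input stripped of spaces and hyphens, and
    finally the lowercased fallback term.
    """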
    normalized = raw_label.strip().lower()
    fallback_chunk = ""
    for match in WORD_CHUNK_RE.finditer(normalized):
        chunk = match.group(0)
        if contains_cyrillic(chunk):
            return chunk
        if not fallback_chunk:
            fallback_chunk = chunk
    if fallback_chunk:
        return fallback_chunk
    sanitized = normalized.replace(" ", "").replace("-", "")
    if sanitized:
        return sanitized
    return fallback_term.lower()


def resolve_position(event: Dict[str, object], fallback_index: int) -> int:
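    """Return the event's display position, parsing digits out of strings."""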
    value = event.get("position")
    if isinstance(value, int):
        return value
    if isinstance(value, str):
        digits = "".join(ch for ch in value if ch.isdigit())
        if digits:
            try:
                return int(digits)
            except ValueError:
                pass
    return fallback_index + 1


def load_month_payloads(month_dir: Path) -> List[MonthPayload]:
    payloads: List[MonthPayload] = []
    for path in iter_month_files(month_dir):
        data = json.loads(path.read_text(encoding="utf-8"))
        payloads.append(MonthPayload(path=path, data=data))
    return payloads


def collect_event_records(months: Sequence[MonthPayload]) -> tuple[List[EventRecord], Counter]:
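    """Build one EventRecord per event plus a corpus-wide token Counter."""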
    records: List[EventRecord] = []
    counter: Counter = Counter()
    for month in months:
        for index, event in enumerate(month.events):
            tokens = tokenize_event(event)
            if tokens:
                counter.update(tokens)
            position = resolve_position(event, index)
            records.append(
                EventRecord(
                    month=month,
                    event_index=index,
                    position=position,
                    tokens=tokens,
                )
            )
    return records, counter


def select_event_terms(tokens: Sequence[str], allowed_terms: set[str], counter: Counter) -> List[str]:
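    """Pick up to MAX_TAGS_PER_EVENT terms for one event.

    Unique tokens are ranked by corpus frequency (ties alphabetical) and
    filtered to allowed_terms; if nothing qualifies, the top
    FALLBACK_TAG_COUNT raw tokens are used, and tokenless events fall back
    to DEFAULT_FALLBACK_TERM.
    """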
    if not tokens:
        return [DEFAULT_FALLBACK_TERM]
    seen = set()
    unique_tokens: List[str] = []
    for term in tokens:
        if term in seen:
            continue
        seen.add(term)
        unique_tokens.append(term)
    ordered = sorted(unique_tokens, key=lambda term: (-counter[term], term))
    selected = [term for term in ordered if term in allowed_terms][:MAX_TAGS_PER_EVENT]
    if not selected:
        limit = max(1, min(FALLBACK_TAG_COUNT, len(ordered)))
        selected = ordered[:limit]
    return selected or [DEFAULT_FALLBACK_TERM]


def slugify_term(term: str) -> str:
    slug = SLUG_SANITIZE_RE.sub("-", term.lower()).strip("-")
    return slug or term.lower()


def translate_term_cached(
    translator: GoogleTranslator,
    cache: Dict[str, str],
    term: str,
) -> str:
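    """Translate a term to Russian, memoizing results for the run.

    On a cache miss the raw translation is reduced with simplify_label
    (falling back to the English term if the call fails) and the function
    sleeps THROTTLE_SECONDS to rate-limit the translation service.
    """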
    if term in cache:
        return cache[term]
    try:
        raw = translator.translate(term).strip().lower()
    except Exception:
        raw = term
    ru_value = simplify_label(raw, term)
    cache[term] = ru_value
    time.sleep(THROTTLE_SECONDS)
    return ru_value


def build_event_reference(record: EventRecord, event_id: str) -> Dict[str, object]:
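    """Build the per-event entry stored under each tag in tags.json."""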
    event = record.event
    date_value = event.get("date_range") or event.get("date") or ""
    return {
        "event_id": event_id,
        "month": record.month.slug,
        "month_label": record.month.label,
        "month_file": record.month.path.name,
        "position": record.position,
        "name": event.get("name", ""),
        "name_ru": event.get("name_ru", ""),
        "date_range": date_value,
        "detail_url": event.get("detail_url", ""),
        "geo_text": event.get("geo_text", ""),
    }


def apply_tags_to_events(
    records: Sequence[EventRecord],
    allowed_terms: set[str],
    counter: Counter,
    translator: GoogleTranslator,
) -> Dict[str, Dict[str, object]]:
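    """Attach search_tags to every event and index events by tag slug.

    Each tag is a {"slug", "label"} pair whose label is the cached Russian
    translation of the token, e.g. slug "textile" with label "текстиль"
    (illustrative values, not taken from the data).
    """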
    tag_index: Dict[str, Dict[str, object]] = {}
    translation_cache: Dict[str, str] = {}
    for record in records:
        terms = select_event_terms(record.tokens, allowed_terms, counter)
        search_tags: List[Dict[str, str]] = []
        event = record.event
        event_id = f"{record.month.slug}#{record.position}"
        for term in terms:
            slug = slugify_term(term)
            label = translate_term_cached(translator, translation_cache, term)
            search_tags.append({"slug": slug, "label": label})
            entry = tag_index.setdefault(
                slug,
                {
                    "slug": slug,
                    "token": term,
                    "label": label,
                    "events": [],
                },
            )
            entry["events"].append(build_event_reference(record, event_id))
        search_tags.sort(key=lambda item: item["label"])
        event["search_tags"] = search_tags
    return tag_index


def write_month_payloads(months: Sequence[MonthPayload]) -> None:
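    """Write each month payload back to its JSON file, pretty-printed."""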
    for month in months:
        serialized = json.dumps(month.data, ensure_ascii=False, indent=4)
        month.path.write_text(f"{serialized}\n", encoding="utf-8")


def build_tag_payload(
    tag_index: Dict[str, Dict[str, object]],
    month_dir: Path,
    total_events: int,
) -> Dict[str, object]:
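    """Assemble the tags.json payload: run metadata plus tag records.

    Tags are sorted by Russian label; each tag lists its events ordered by
    month, position, and event id.
    """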
    tag_records = []
    for entry in tag_index.values():
        events = sorted(
            entry["events"],
            key=lambda item: (item["month"], item["position"], item["event_id"]),
        )
        tag_records.append(
            {
                "slug": entry["slug"],
                "label": entry["label"],
                "token": entry["token"],
                "events_count": len(events),
                "events": events,
            }
        )
    tag_records.sort(key=lambda item: item["label"])
    return {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "min_frequency": MIN_FREQUENCY,
        "max_tags_per_event": MAX_TAGS_PER_EVENT,
        "fallback_tag_count": FALLBACK_TAG_COUNT,
        "source_dir": str(month_dir),
        "total_tags": len(tag_records),
        "total_events_indexed": total_events,
        "tags": tag_records,
    }


def main() -> None:
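    """Load cached months, tag every event, and write tags.json."""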
    month_dir = Path("cache/months")
    if not month_dir.exists():
        raise SystemExit("cache/months directory is missing")

    months = load_month_payloads(month_dir)
    if not months:
        raise SystemExit("No month JSON files were found in cache/months")

    records, counter = collect_event_records(months)
    allowed_terms = {term for term, freq in counter.items() if freq >= MIN_FREQUENCY}

    translator = GoogleTranslator(source="en", target="ru")
    tag_index = apply_tags_to_events(records, allowed_terms, counter, translator)

    write_month_payloads(months)
    payload = build_tag_payload(tag_index, month_dir, len(records))

    output_path = Path("tags.json")
    output_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"Updated {len(records)} events across {len(months)} month files")
    print(f"Saved {payload['total_tags']} grouped tags to {output_path}")


if __name__ == "__main__":
    main()
