from dataclasses import dataclass
from datetime import date
from pathlib import Path
import re

try:
    from pypdf import PdfReader
except ModuleNotFoundError:
    PdfReader = None


@dataclass
class ParsedExcursion:
    """Structured data extracted from a single excursion PDF.

    Instances are produced by ``parse_pdf``. Optional fields are ``None``
    when the corresponding value could not be found in the document.
    """

    # Title taken from the file name (the whole stem when the name does not
    # follow the "YY.MM.DD - Title" pattern).
    title: str
    # First 1000 characters of the excerpt, or "No text excerpt available."
    # when the excerpt is empty.
    summary: str
    # Date encoded in the file name; None if absent or not a valid date.
    excursion_date: date | None
    # Start-time phrase as written in the text (e.g. "a les 7:30"); it is
    # kept as text and not converted to a clock time.
    start_time_text: str | None
    # Number that follows the word "temperatura" in the text.
    start_temperature_c: float | None
    # Last "Distància <n> Km" figure found in the text.
    car_distance_km: float | None
    # Lowest and highest "(alt. <n> m)" mentions found in the text.
    min_altitude_m: int | None
    max_altitude_m: int | None
    # Whether the PDF appears to contain a text layer.
    has_text_layer: bool
    # Page count reported by pypdf when available, otherwise the number of
    # "/Type /Page" markers counted in the raw bytes.
    page_count_estimated: int
    # Leading part of the extracted text (or of the raw-byte fallback excerpt).
    raw_text_excerpt: str
    # One of "parsed_text", "metadata_only" or "ocr_needed".
    extraction_status: str


# File stems are expected to look like "YY.MM.DD - Title"; the named groups
# capture the two-digit year, month and day plus the free-form title.
FILENAME_RE = re.compile(r"^(?P<yy>\d{2})\.(?P<mm>\d{2})\.(?P<dd>\d{2})\s*-\s*(?P<title>.+)$")
# Catalan route vocabulary (distance, elevation gain, duration, itinerary,
# departure, arrival, difficulty, altitude, km, ...). parse_pdf searches the
# extracted text with it when choosing the extraction status. The match is
# case-insensitive and not anchored to word boundaries.
ROUTE_KEYWORDS_RE = re.compile(
    r"(dist[aà]ncia|desnivell|durada|hores?|itinerari|sortida|arribada|ruta|recorregut|dificultat|altitud|cota|km)",
    re.IGNORECASE,
)


def _parse_date_and_title_from_filename(pdf_path: Path) -> tuple[date | None, str]:
    """Split a "YY.MM.DD - Title" file stem into a date and a title.

    Returns ``(None, stem)`` when the stem does not follow the pattern, and
    ``(None, title)`` when it does but the numbers do not form a valid date.
    Two-digit years are interpreted as 20YY. Surrounding whitespace and
    trailing dots are removed from the title.
    """
    name = pdf_path.stem.strip()
    found = FILENAME_RE.match(name)
    if found is None:
        return None, name

    year, month, day = (int(found[key]) for key in ("yy", "mm", "dd"))
    cleaned_title = found["title"].strip().rstrip(".")

    try:
        return date(2000 + year, month, day), cleaned_title
    except ValueError:
        # e.g. month 13 or day 31 in a 30-day month: keep the title only.
        return None, cleaned_title


def _estimate_page_count(raw_pdf: bytes) -> int:
    return len(re.findall(rb"/Type\s*/Page\b", raw_pdf))


def _detect_text_layer(raw_pdf: bytes) -> bool:
    return b"BT" in raw_pdf and b"ET" in raw_pdf and b"/Font" in raw_pdf


def _extract_text_with_pypdf(pdf_path: Path) -> tuple[int, str]:
    """Extract the page count and text of *pdf_path* using pypdf.

    Returns ``(0, "")`` when pypdf is not installed. Otherwise returns the
    number of pages and the stripped text of every non-empty page, joined
    with newlines. Errors raised by pypdf are not caught here.
    """
    if PdfReader is None:
        return 0, ""

    pages = PdfReader(str(pdf_path)).pages
    total_pages = len(pages)
    # extract_text() may yield None/empty for image-only pages; skip those.
    stripped_pages = ((page.extract_text() or "").strip() for page in pages)
    return total_pages, "\n".join(chunk for chunk in stripped_pages if chunk)


def _clean_text(text: str) -> str:
    text = text.replace("\u00a0", " ")
    text = re.sub(r"\s+", " ", text)
    return text.strip()


def _parse_start_time_text(text: str) -> str | None:
    time_expr = r"(?:a\s+les\s+\d{1,2}(?::\d{2})?|(?:un|dos|tres)\s+quarts\s+de\s+[A-Za-zÀ-ÿ]+|prop\s+de\s+les\s+\d{1,2}|abans\s+de\s+les\s+\d{1,2})"
    patterns = (
        rf"(?:Marxem|Sortim)\s+de\s+Sabadell\s+({time_expr})",
        rf"Comencem\s+a\s+caminar\s+({time_expr})",
    )
    for pattern in patterns:
        match = re.search(pattern, text, flags=re.IGNORECASE)
        if match:
            value = _clean_text(match.group(1))
            value = re.sub(r"\s+", " ", value)
            return value
    return None


def _parse_temperature_c(text: str) -> float | None:
    match = re.search(r"temperatura\s*(?:de\s*)?(\d+(?:[\.,]\d+)?)\s*º", text, flags=re.IGNORECASE)
    if not match:
        return None
    value = match.group(1).replace(",", ".")
    try:
        return float(value)
    except ValueError:
        return None


def _parse_car_distance_km(text: str) -> float | None:
    # Typically appears at the end: "Cotxe(s) ... Distància 125 Km"
    candidates = re.findall(r"Dist[aà]ncia\s*(\d+(?:[\.,]\d+)?)\s*Km", text, flags=re.IGNORECASE)
    if not candidates:
        return None
    try:
        return float(candidates[-1].replace(",", "."))
    except ValueError:
        return None


def _parse_altitude_bounds_m(text: str) -> tuple[int | None, int | None]:
    altitudes = re.findall(r"\(alt\.\s*(\d{2,4})\s*m\)", text, flags=re.IGNORECASE)
    if not altitudes:
        return None, None
    values = []
    for raw in altitudes:
        try:
            values.append(int(raw))
        except ValueError:
            continue
    if not values:
        return None, None
    return min(values), max(values)


def _extract_human_text_excerpt(raw_pdf: bytes, max_len: int = 1000) -> str:
    runs = re.findall(rb"[\x20-\x7E]{8,}", raw_pdf)
    cleaned = []
    skip_tokens = (
        "%PDF",
        " obj",
        "endobj",
        "stream",
        "endstream",
        "/Type",
        "/Pages",
        "/Font",
        "/XObject",
        "/MediaBox",
    )

    def is_humanish(text: str) -> bool:
        if len(text) < 20:
            return False

        invalid_chars = len(re.findall(r"[^A-Za-z0-9À-ÿ .,;:!?()'\"/\-]", text))
        if invalid_chars / max(len(text), 1) > 0.08:
            return False

        words = re.findall(r"[A-Za-zÀ-ÿ]{2,}", text)
        if len(words) < 3:
            return False

        letters = "".join(words).lower()
        vowel_count = sum(1 for c in letters if c in "aeiouàèéíïòóúü")
        if len(letters) > 0 and (vowel_count / len(letters)) < 0.22:
            return False

        return True

    for run in runs:
        text = run.decode("latin-1", errors="ignore").strip()
        if not text:
            continue
        if any(token in text for token in skip_tokens):
            continue
        if not re.search(r"[A-Za-z]{3}", text):
            continue
        if not is_humanish(text):
            continue
        cleaned.append(text)
        if len(" ".join(cleaned)) > max_len:
            break

    excerpt = " ".join(cleaned)[:max_len]
    return excerpt


def parse_pdf(pdf_path: Path) -> ParsedExcursion:
    """Parse one excursion PDF into a ``ParsedExcursion`` record.

    The date and title come from the file name. Text is extracted with
    pypdf when possible:

    * If text was extracted, the detail fields (start time, temperature, car
      distance, altitude bounds) are parsed from it, and the status is
      ``"parsed_text"`` when a route keyword is present, otherwise
      ``"metadata_only"``.
    * If no text was extracted, an excerpt is scraped from the raw bytes and
      every detail field is ``None``. The status is ``"metadata_only"`` only
      when that excerpt has at least 80 characters and contains a route
      keyword; otherwise it is ``"ocr_needed"``.

    Errors from reading the file or from pypdf are not caught here and
    propagate to the caller.
    """
    raw_pdf = pdf_path.read_bytes()
    excursion_date, title = _parse_date_and_title_from_filename(pdf_path)
    estimated_pages = _estimate_page_count(raw_pdf)

    pypdf_pages, full_text = _extract_text_with_pypdf(pdf_path)
    clean_text = _clean_text(full_text)

    # Text that was actually extracted proves a text layer exists. The
    # raw-byte heuristic alone can miss it when the markers it looks for are
    # not stored literally in the file, which would leave the flag
    # contradicting a "parsed_text" status.
    has_text_layer = bool(clean_text) or _detect_text_layer(raw_pdf)

    if clean_text:
        excerpt = clean_text[:2500]
        # pypdf reports 0 pages when it is unavailable; fall back to the
        # raw-byte estimate in that case.
        page_count = pypdf_pages or estimated_pages
        if ROUTE_KEYWORDS_RE.search(clean_text):
            extraction_status = "parsed_text"
        else:
            extraction_status = "metadata_only"

        start_time_text = _parse_start_time_text(clean_text)
        start_temperature_c = _parse_temperature_c(clean_text)
        car_distance_km = _parse_car_distance_km(clean_text)
        min_altitude_m, max_altitude_m = _parse_altitude_bounds_m(clean_text)
    else:
        excerpt = _extract_human_text_excerpt(raw_pdf)
        page_count = estimated_pages
        start_time_text = None
        start_temperature_c = None
        car_distance_km = None
        min_altitude_m = None
        max_altitude_m = None

        # A short or keyword-free scrape is not trustworthy enough to count
        # as metadata.
        if len(excerpt) >= 80 and ROUTE_KEYWORDS_RE.search(excerpt):
            extraction_status = "metadata_only"
        else:
            extraction_status = "ocr_needed"

    summary = excerpt[:1000] if excerpt else "No text excerpt available."

    return ParsedExcursion(
        title=title,
        summary=summary,
        excursion_date=excursion_date,
        start_time_text=start_time_text,
        start_temperature_c=start_temperature_c,
        car_distance_km=car_distance_km,
        min_altitude_m=min_altitude_m,
        max_altitude_m=max_altitude_m,
        has_text_layer=has_text_layer,
        page_count_estimated=page_count,
        raw_text_excerpt=excerpt,
        extraction_status=extraction_status,
    )
