# movie-night/app/services/prefilter.py

import json
import re

from app.models import Movie

# Mood signal → genre boosts and filters
#
# Each key is a lower-case keyword (or short phrase) looked up in the user's
# mood text. Its value has three fields:
#   "boost"      - genres that raise a movie's score
#   "penalize"   - genres that lower a movie's score
#   "max_rating" - strictest content rating allowed (a RATING_ORDER entry),
#                  or None for no rating ceiling
#
# Genre names are compared verbatim against each movie's genre list, so the
# science-fiction genre is listed under both spellings ("Sci-Fi" and
# "Science Fiction") wherever it is boosted.
# NOTE(review): presumably the spelling depends on the metadata provider -
# confirm which one(s) the library actually contains.
MOOD_SIGNALS = {
    "kids": {"boost": ["Family", "Animation", "Comedy", "Adventure"], "penalize": ["Horror", "Thriller"], "max_rating": "PG-13"},
    "children": {"boost": ["Family", "Animation", "Comedy", "Adventure"], "penalize": ["Horror", "Thriller"], "max_rating": "PG"},
    "family": {"boost": ["Family", "Animation", "Comedy", "Adventure"], "penalize": ["Horror", "Thriller"], "max_rating": "PG-13"},
    "scary": {"boost": ["Horror", "Thriller", "Mystery"], "penalize": [], "max_rating": None},
    "horror": {"boost": ["Horror", "Thriller"], "penalize": [], "max_rating": None},
    "spooky": {"boost": ["Horror", "Thriller", "Mystery", "Fantasy"], "penalize": [], "max_rating": None},
    "creepy": {"boost": ["Horror", "Thriller", "Mystery"], "penalize": [], "max_rating": None},
    "funny": {"boost": ["Comedy"], "penalize": ["Horror", "War"], "max_rating": None},
    "comedy": {"boost": ["Comedy"], "penalize": [], "max_rating": None},
    "laugh": {"boost": ["Comedy"], "penalize": [], "max_rating": None},
    "light": {"boost": ["Comedy", "Romance", "Animation", "Family"], "penalize": ["Horror", "Thriller", "War"], "max_rating": None},
    "fun": {"boost": ["Comedy", "Adventure", "Animation", "Action"], "penalize": ["Horror", "War"], "max_rating": None},
    "feel-good": {"boost": ["Comedy", "Romance", "Family", "Animation"], "penalize": ["Horror", "Thriller", "War"], "max_rating": None},
    "relax": {"boost": ["Comedy", "Romance", "Drama"], "penalize": ["Horror", "Thriller", "Action"], "max_rating": None},
    "action": {"boost": ["Action", "Adventure", "Sci-Fi", "Science Fiction", "Thriller"], "penalize": [], "max_rating": None},
    "exciting": {"boost": ["Action", "Adventure", "Thriller"], "penalize": [], "max_rating": None},
    "adventure": {"boost": ["Adventure", "Action", "Fantasy", "Sci-Fi", "Science Fiction"], "penalize": [], "max_rating": None},
    "intense": {"boost": ["Action", "Thriller", "Drama", "War"], "penalize": [], "max_rating": None},
    "romantic": {"boost": ["Romance", "Comedy", "Drama"], "penalize": ["Horror", "War"], "max_rating": None},
    "romance": {"boost": ["Romance", "Comedy", "Drama"], "penalize": [], "max_rating": None},
    "date night": {"boost": ["Romance", "Comedy", "Drama", "Thriller"], "penalize": [], "max_rating": None},
    "date": {"boost": ["Romance", "Comedy", "Drama"], "penalize": [], "max_rating": None},
    "sad": {"boost": ["Drama", "Romance"], "penalize": ["Comedy", "Animation"], "max_rating": None},
    "cry": {"boost": ["Drama", "Romance", "War"], "penalize": [], "max_rating": None},
    "drama": {"boost": ["Drama"], "penalize": [], "max_rating": None},
    "sci-fi": {"boost": ["Science Fiction", "Sci-Fi", "Fantasy"], "penalize": [], "max_rating": None},
    "space": {"boost": ["Science Fiction", "Sci-Fi"], "penalize": [], "max_rating": None},
    "fantasy": {"boost": ["Fantasy", "Adventure"], "penalize": [], "max_rating": None},
    "mystery": {"boost": ["Mystery", "Thriller", "Crime"], "penalize": [], "max_rating": None},
    "crime": {"boost": ["Crime", "Thriller", "Mystery"], "penalize": [], "max_rating": None},
    "documentary": {"boost": ["Documentary"], "penalize": [], "max_rating": None},
    "war": {"boost": ["War", "History", "Drama"], "penalize": [], "max_rating": None},
    # "classic" carries no genre signal of its own; it is kept as a recognised keyword
    "classic": {"boost": [], "penalize": [], "max_rating": None},
    "animated": {"boost": ["Animation"], "penalize": [], "max_rating": None},
    "anime": {"boost": ["Animation"], "penalize": [], "max_rating": None},
    "music": {"boost": ["Music", "Musical"], "penalize": [], "max_rating": None},
    "musical": {"boost": ["Music", "Musical"], "penalize": [], "max_rating": None},
    "western": {"boost": ["Western"], "penalize": [], "max_rating": None},
    "superhero": {"boost": ["Action", "Adventure", "Science Fiction", "Sci-Fi"], "penalize": [], "max_rating": None},
}

# Content ratings ranked from most to least family-friendly. A rating's
# position in this list is what rating comparisons are based on.
RATING_ORDER = [
    "G",
    "PG",
    "PG-13",
    "R",
    "NC-17",
    # Unrated labels rank after every real rating
    "NR",
    "Not Rated",
    None,
]


def _parse_decade(mood: str) -> tuple[int, int] | None:
    """Extract decade filter from mood text."""
    match = re.search(r"\b(19|20)(\d)0s\b", mood.lower())
    if match:
        decade_start = int(match.group(1) + match.group(2) + "0")
        return (decade_start, decade_start + 9)

    match = re.search(r"\b(old|classic|vintage|retro)\b", mood.lower())
    if match:
        return (1920, 1989)

    return None


def _is_rating_appropriate(content_rating: str | None, max_rating: str | None) -> bool:
    """Check if a movie's content rating is at or below the max allowed."""
    if max_rating is None:
        return True
    if content_rating is None:
        return True  # Unknown rating, let it through

    try:
        movie_idx = RATING_ORDER.index(content_rating)
        max_idx = RATING_ORDER.index(max_rating)
        return movie_idx <= max_idx
    except ValueError:
        return True  # Unknown rating format, let it through


def _parse_movie(raw: dict) -> Movie:
    """Build a Movie model from a raw DB row dict.

    ``jellyfin_id`` and ``title`` are required (a missing key raises
    KeyError); every other column is optional. The genres, studios, people
    and tags columns hold JSON text and are decoded, with a missing or empty
    value decoding to an empty list.
    """
    # (column name, stored as JSON text?) in the order Movie receives them
    optional_columns = (
        ("sort_title", False),
        ("year", False),
        ("genres", True),
        ("overview", False),
        ("community_rating", False),
        ("critic_rating", False),
        ("runtime_minutes", False),
        ("content_rating", False),
        ("studios", True),
        ("people", True),
        ("tags", True),
    )

    fields = {"jellyfin_id": raw["jellyfin_id"], "title": raw["title"]}
    for column, is_json in optional_columns:
        value = raw.get(column)
        fields[column] = json.loads(value or "[]") if is_json else value

    return Movie(**fields)


def prefilter_candidates(movies_raw: list[dict], mood: str, max_candidates: int = 200) -> list[Movie]:
    """Score and filter movies based on mood signals. Returns top candidates as Movie models."""
    mood_lower = mood.lower()

    # Collect all active signals
    boost_genres: set[str] = set()
    penalize_genres: set[str] = set()
    max_rating: str | None = None
    decade = _parse_decade(mood)

    for keyword, signals in MOOD_SIGNALS.items():
        if keyword in mood_lower:
            boost_genres.update(signals["boost"])
            penalize_genres.update(signals["penalize"])
            if signals["max_rating"] and (max_rating is None or RATING_ORDER.index(signals["max_rating"]) < RATING_ORDER.index(max_rating)):
                max_rating = signals["max_rating"]

    # Remove any genres that appear in both boost and penalize
    penalize_genres -= boost_genres

    scored: list[tuple[float, dict]] = []

    for raw in movies_raw:
        movie_genres = set(json.loads(raw.get("genres") or "[]"))
        content_rating = raw.get("content_rating")

        # Filter by content rating
        if not _is_rating_appropriate(content_rating, max_rating):
            continue

        # Filter by decade
        year = raw.get("year")
        if decade and year:
            if year < decade[0] or year > decade[1]:
                continue

        # Score the movie
        score = 0.0

        # Genre match bonus
        if boost_genres:
            genre_overlap = len(movie_genres & boost_genres)
            score += genre_overlap * 3.0

        # Genre penalty
        if penalize_genres:
            penalty_overlap = len(movie_genres & penalize_genres)
            score -= penalty_overlap * 2.0

        # Rating bonus (higher rated movies get a small boost)
        rating = raw.get("community_rating")
        if rating:
            score += rating * 0.3

        # Keyword match in overview
        overview = (raw.get("overview") or "").lower()
        mood_words = [w for w in mood_lower.split() if len(w) > 3]
        for word in mood_words:
            if word in overview:
                score += 1.0

        scored.append((score, raw))

    # Sort by score descending
    scored.sort(key=lambda x: x[0], reverse=True)

    # Return top candidates as Movie models
    return [_parse_movie(raw) for _, raw in scored[:max_candidates]]