app/services/prefilter.py

import json
import re

from app.models import Movie

# Mood signal → genre boosts and filters.
#
# Each key is a lowercase keyword that is looked up in the lowercased mood
# text. Its value describes what that keyword implies:
#   "boost":      genres that raise a movie's score when the movie has them
#   "penalize":   genres that lower a movie's score when the movie has them
#   "max_rating": content-rating cap applied when the keyword is present
#                 (None = the keyword imposes no cap)
#
# Genre names are compared to the movie's genre strings exactly. Science
# fiction appears under two spellings in this table ("Science Fiction" and
# "Sci-Fi"), so every entry that boosts it lists both; otherwise the boost
# would silently miss libraries tagged with the other spelling.
MOOD_SIGNALS = {
    "kids": {"boost": ["Family", "Animation", "Comedy", "Adventure"], "penalize": ["Horror", "Thriller"], "max_rating": "PG-13"},
    "children": {"boost": ["Family", "Animation", "Comedy", "Adventure"], "penalize": ["Horror", "Thriller"], "max_rating": "PG"},
    "family": {"boost": ["Family", "Animation", "Comedy", "Adventure"], "penalize": ["Horror", "Thriller"], "max_rating": "PG-13"},
    "scary": {"boost": ["Horror", "Thriller", "Mystery"], "penalize": [], "max_rating": None},
    "horror": {"boost": ["Horror", "Thriller"], "penalize": [], "max_rating": None},
    "spooky": {"boost": ["Horror", "Thriller", "Mystery", "Fantasy"], "penalize": [], "max_rating": None},
    "creepy": {"boost": ["Horror", "Thriller", "Mystery"], "penalize": [], "max_rating": None},
    "funny": {"boost": ["Comedy"], "penalize": ["Horror", "War"], "max_rating": None},
    "comedy": {"boost": ["Comedy"], "penalize": [], "max_rating": None},
    "laugh": {"boost": ["Comedy"], "penalize": [], "max_rating": None},
    "light": {"boost": ["Comedy", "Romance", "Animation", "Family"], "penalize": ["Horror", "Thriller", "War"], "max_rating": None},
    "fun": {"boost": ["Comedy", "Adventure", "Animation", "Action"], "penalize": ["Horror", "War"], "max_rating": None},
    "feel-good": {"boost": ["Comedy", "Romance", "Family", "Animation"], "penalize": ["Horror", "Thriller", "War"], "max_rating": None},
    "relax": {"boost": ["Comedy", "Romance", "Drama"], "penalize": ["Horror", "Thriller", "Action"], "max_rating": None},
    "action": {"boost": ["Action", "Adventure", "Science Fiction", "Sci-Fi", "Thriller"], "penalize": [], "max_rating": None},
    "exciting": {"boost": ["Action", "Adventure", "Thriller"], "penalize": [], "max_rating": None},
    "adventure": {"boost": ["Adventure", "Action", "Fantasy", "Science Fiction", "Sci-Fi"], "penalize": [], "max_rating": None},
    "intense": {"boost": ["Action", "Thriller", "Drama", "War"], "penalize": [], "max_rating": None},
    "romantic": {"boost": ["Romance", "Comedy", "Drama"], "penalize": ["Horror", "War"], "max_rating": None},
    "romance": {"boost": ["Romance", "Comedy", "Drama"], "penalize": [], "max_rating": None},
    "date night": {"boost": ["Romance", "Comedy", "Drama", "Thriller"], "penalize": [], "max_rating": None},
    "date": {"boost": ["Romance", "Comedy", "Drama"], "penalize": [], "max_rating": None},
    "sad": {"boost": ["Drama", "Romance"], "penalize": ["Comedy", "Animation"], "max_rating": None},
    "cry": {"boost": ["Drama", "Romance", "War"], "penalize": [], "max_rating": None},
    "drama": {"boost": ["Drama"], "penalize": [], "max_rating": None},
    "sci-fi": {"boost": ["Science Fiction", "Sci-Fi", "Fantasy"], "penalize": [], "max_rating": None},
    "space": {"boost": ["Science Fiction", "Sci-Fi"], "penalize": [], "max_rating": None},
    "fantasy": {"boost": ["Fantasy", "Adventure"], "penalize": [], "max_rating": None},
    "mystery": {"boost": ["Mystery", "Thriller", "Crime"], "penalize": [], "max_rating": None},
    "crime": {"boost": ["Crime", "Thriller", "Mystery"], "penalize": [], "max_rating": None},
    "documentary": {"boost": ["Documentary"], "penalize": [], "max_rating": None},
    "war": {"boost": ["War", "History", "Drama"], "penalize": [], "max_rating": None},
    # "classic" carries no genre signal of its own; it is listed so the word is
    # a recognised keyword with an explicit (empty) entry.
    "classic": {"boost": [], "penalize": [], "max_rating": None},
    "animated": {"boost": ["Animation"], "penalize": [], "max_rating": None},
    "anime": {"boost": ["Animation"], "penalize": [], "max_rating": None},
    "music": {"boost": ["Music", "Musical"], "penalize": [], "max_rating": None},
    "musical": {"boost": ["Music", "Musical"], "penalize": [], "max_rating": None},
    "western": {"boost": ["Western"], "penalize": [], "max_rating": None},
    "superhero": {"boost": ["Action", "Adventure", "Science Fiction", "Sci-Fi"], "penalize": [], "max_rating": None},
}

# Content rating hierarchy for family filtering, ordered from least to most
# restrictive. Ratings are compared by their index in this list: a movie passes
# a rating cap when its index is <= the cap's index.
# "NR" / "Not Rated" are placed after "NC-17", so a movie explicitly marked as
# unrated is excluded by any cap from "G" through "NC-17".
# A rating string that is not spelled exactly as one of these entries is not
# found by .index(); _is_rating_appropriate treats that as unknown and lets the
# movie through.
# NOTE(review): the trailing None looks unused in the code visible here — every
# .index() lookup is guarded against None first. Confirm before removing.
RATING_ORDER = ["G", "PG", "PG-13", "R", "NC-17", "NR", "Not Rated", None]


def _parse_decade(mood: str) -> tuple[int, int] | None:
    """Extract decade filter from mood text."""
    match = re.search(r"\b(19|20)(\d)0s\b", mood.lower())
    if match:
        decade_start = int(match.group(1) + match.group(2) + "0")
        return (decade_start, decade_start + 9)

    match = re.search(r"\b(old|classic|vintage|retro)\b", mood.lower())
    if match:
        return (1920, 1989)

    return None


def _is_rating_appropriate(content_rating: str | None, max_rating: str | None) -> bool:
    """Check if a movie's content rating is at or below the max allowed."""
    if max_rating is None:
        return True
    if content_rating is None:
        return True  # Unknown rating, let it through

    try:
        movie_idx = RATING_ORDER.index(content_rating)
        max_idx = RATING_ORDER.index(max_rating)
        return movie_idx <= max_idx
    except ValueError:
        return True  # Unknown rating format, let it through


def _parse_movie(raw: dict) -> Movie:
    """Build a Movie model from a raw DB row dict.

    "jellyfin_id" and "title" are required (a missing key raises KeyError).
    All other fields are optional and default to None. "genres", "studios",
    "people" and "tags" are decoded with json.loads; a missing, None or empty
    value decodes as an empty list.
    """

    def decode_json(column: str):
        # Falsy column values fall back to the JSON text for an empty list.
        return json.loads(raw.get(column) or "[]")

    # Built in the same field order the row is read, then passed as keywords.
    fields = {
        "jellyfin_id": raw["jellyfin_id"],
        "title": raw["title"],
        "sort_title": raw.get("sort_title"),
        "year": raw.get("year"),
        "genres": decode_json("genres"),
        "overview": raw.get("overview"),
        "community_rating": raw.get("community_rating"),
        "critic_rating": raw.get("critic_rating"),
        "runtime_minutes": raw.get("runtime_minutes"),
        "content_rating": raw.get("content_rating"),
        "studios": decode_json("studios"),
        "people": decode_json("people"),
        "tags": decode_json("tags"),
    }
    return Movie(**fields)


def prefilter_candidates(movies_raw: list[dict], mood: str, max_candidates: int = 200, kid_friendly: bool = False) -> list[Movie]:
    """Score and filter movies based on mood signals. Returns top candidates as Movie models.

    Args:
        movies_raw: Raw movie rows. Each row is read via ``.get`` for
            "genres" (JSON-encoded list), "content_rating", "year",
            "community_rating" and "overview".
        mood: Free-text mood description; matched case-insensitively.
        max_candidates: Maximum number of movies to return.
        kid_friendly: When True, start from a "PG-13" rating cap plus the
            family genre boosts/penalties before mood keywords are applied.

    Returns:
        At most ``max_candidates`` movies, highest score first. Movies with
        equal scores keep their input order (the sort is stable).

    Filtering: a movie is dropped when its content rating exceeds the active
    cap, or when a decade was requested and the movie's known year falls
    outside it. Movies with no year are never dropped by the decade filter.

    Scoring: +3.0 per boosted genre, -2.0 per penalized genre,
    +0.3 * community_rating, and +1.0 for each mood word longer than three
    characters that occurs in the overview.
    """
    mood_lower = mood.lower()

    # Collect all active signals
    boost_genres: set[str] = set()
    penalize_genres: set[str] = set()
    max_rating: str | None = None
    decade = _parse_decade(mood)

    # Kid-friendly toggle sets the baseline; mood keywords below can still
    # tighten the rating cap (a stricter keyword cap replaces "PG-13").
    if kid_friendly:
        max_rating = "PG-13"
        boost_genres.update(["Family", "Animation", "Comedy", "Adventure"])
        penalize_genres.update(["Horror", "Thriller"])

    for keyword, signals in MOOD_SIGNALS.items():
        # Match keywords only at the start of a word. A plain substring test
        # fires on unrelated words: "heartwarming" and "award" contain "war",
        # "satisfaction" contains "action", "update" contains "date".
        # Anchoring only the start keeps inflected forms working
        # ("laughing", "crying", "dramatic", "lighthearted").
        # NOTE(review): word-initial false positives remain (e.g. "warm" still
        # hits "war"); tightening further would drop those inflected forms.
        if not re.search(r"(?<!\w)" + re.escape(keyword), mood_lower):
            continue

        boost_genres.update(signals["boost"])
        penalize_genres.update(signals["penalize"])

        # Keep the strictest cap seen so far (lower index = stricter).
        keyword_cap = signals["max_rating"]
        if keyword_cap and (max_rating is None or RATING_ORDER.index(keyword_cap) < RATING_ORDER.index(max_rating)):
            max_rating = keyword_cap

    # Remove any genres that appear in both boost and penalize
    penalize_genres -= boost_genres

    # Mood words to look for in overviews. Computed once: it depends only on
    # the mood text, not on the movie being scored.
    mood_words = [w for w in mood_lower.split() if len(w) > 3]

    scored: list[tuple[float, dict]] = []

    for raw in movies_raw:
        movie_genres = set(json.loads(raw.get("genres") or "[]"))
        content_rating = raw.get("content_rating")

        # Filter by content rating
        if not _is_rating_appropriate(content_rating, max_rating):
            continue

        # Filter by decade (only when both the decade and the year are known)
        year = raw.get("year")
        if decade and year:
            if year < decade[0] or year > decade[1]:
                continue

        # Score the movie
        score = 0.0

        # Genre match bonus
        if boost_genres:
            genre_overlap = len(movie_genres & boost_genres)
            score += genre_overlap * 3.0

        # Genre penalty
        if penalize_genres:
            penalty_overlap = len(movie_genres & penalize_genres)
            score -= penalty_overlap * 2.0

        # Rating bonus (higher rated movies get a small boost)
        rating = raw.get("community_rating")
        if rating:
            score += rating * 0.3

        # Keyword match in overview (substring match, one point per mood word)
        overview = (raw.get("overview") or "").lower()
        for word in mood_words:
            if word in overview:
                score += 1.0

        scored.append((score, raw))

    # Sort by score descending; keying on the score alone avoids comparing
    # the row dicts when two scores are equal.
    scored.sort(key=lambda x: x[0], reverse=True)

    # Return top candidates as Movie models
    return [_parse_movie(raw) for _, raw in scored[:max_candidates]]
Initial commit — Movie Night media discovery app 2026-03-14 19:20:56 -07:00			`import json`
			`import re`

			`from app.models import Movie`

			`# Mood signal → genre boosts and filters`
			`MOOD_SIGNALS = {`
			`"kids": {"boost": ["Family", "Animation", "Comedy", "Adventure"], "penalize": ["Horror", "Thriller"], "max_rating": "PG-13"},`
			`"children": {"boost": ["Family", "Animation", "Comedy", "Adventure"], "penalize": ["Horror", "Thriller"], "max_rating": "PG"},`
			`"family": {"boost": ["Family", "Animation", "Comedy", "Adventure"], "penalize": ["Horror", "Thriller"], "max_rating": "PG-13"},`
			`"scary": {"boost": ["Horror", "Thriller", "Mystery"], "penalize": [], "max_rating": None},`
			`"horror": {"boost": ["Horror", "Thriller"], "penalize": [], "max_rating": None},`
			`"spooky": {"boost": ["Horror", "Thriller", "Mystery", "Fantasy"], "penalize": [], "max_rating": None},`
			`"creepy": {"boost": ["Horror", "Thriller", "Mystery"], "penalize": [], "max_rating": None},`
			`"funny": {"boost": ["Comedy"], "penalize": ["Horror", "War"], "max_rating": None},`
			`"comedy": {"boost": ["Comedy"], "penalize": [], "max_rating": None},`
			`"laugh": {"boost": ["Comedy"], "penalize": [], "max_rating": None},`
			`"light": {"boost": ["Comedy", "Romance", "Animation", "Family"], "penalize": ["Horror", "Thriller", "War"], "max_rating": None},`
			`"fun": {"boost": ["Comedy", "Adventure", "Animation", "Action"], "penalize": ["Horror", "War"], "max_rating": None},`
			`"feel-good": {"boost": ["Comedy", "Romance", "Family", "Animation"], "penalize": ["Horror", "Thriller", "War"], "max_rating": None},`
			`"relax": {"boost": ["Comedy", "Romance", "Drama"], "penalize": ["Horror", "Thriller", "Action"], "max_rating": None},`
			`"action": {"boost": ["Action", "Adventure", "Sci-Fi", "Thriller"], "penalize": [], "max_rating": None},`
			`"exciting": {"boost": ["Action", "Adventure", "Thriller"], "penalize": [], "max_rating": None},`
			`"adventure": {"boost": ["Adventure", "Action", "Fantasy", "Sci-Fi"], "penalize": [], "max_rating": None},`
			`"intense": {"boost": ["Action", "Thriller", "Drama", "War"], "penalize": [], "max_rating": None},`
			`"romantic": {"boost": ["Romance", "Comedy", "Drama"], "penalize": ["Horror", "War"], "max_rating": None},`
			`"romance": {"boost": ["Romance", "Comedy", "Drama"], "penalize": [], "max_rating": None},`
			`"date night": {"boost": ["Romance", "Comedy", "Drama", "Thriller"], "penalize": [], "max_rating": None},`
			`"date": {"boost": ["Romance", "Comedy", "Drama"], "penalize": [], "max_rating": None},`
			`"sad": {"boost": ["Drama", "Romance"], "penalize": ["Comedy", "Animation"], "max_rating": None},`
			`"cry": {"boost": ["Drama", "Romance", "War"], "penalize": [], "max_rating": None},`
			`"drama": {"boost": ["Drama"], "penalize": [], "max_rating": None},`
			`"sci-fi": {"boost": ["Science Fiction", "Sci-Fi", "Fantasy"], "penalize": [], "max_rating": None},`
			`"space": {"boost": ["Science Fiction", "Sci-Fi"], "penalize": [], "max_rating": None},`
			`"fantasy": {"boost": ["Fantasy", "Adventure"], "penalize": [], "max_rating": None},`
			`"mystery": {"boost": ["Mystery", "Thriller", "Crime"], "penalize": [], "max_rating": None},`
			`"crime": {"boost": ["Crime", "Thriller", "Mystery"], "penalize": [], "max_rating": None},`
			`"documentary": {"boost": ["Documentary"], "penalize": [], "max_rating": None},`
			`"war": {"boost": ["War", "History", "Drama"], "penalize": [], "max_rating": None},`
			`"classic": {"boost": [], "penalize": [], "max_rating": None},`
			`"animated": {"boost": ["Animation"], "penalize": [], "max_rating": None},`
			`"anime": {"boost": ["Animation"], "penalize": [], "max_rating": None},`
			`"music": {"boost": ["Music", "Musical"], "penalize": [], "max_rating": None},`
			`"musical": {"boost": ["Music", "Musical"], "penalize": [], "max_rating": None},`
			`"western": {"boost": ["Western"], "penalize": [], "max_rating": None},`
			`"superhero": {"boost": ["Action", "Adventure", "Science Fiction"], "penalize": [], "max_rating": None},`
			`}`

			`# Content rating hierarchy for family filtering`
			`RATING_ORDER = ["G", "PG", "PG-13", "R", "NC-17", "NR", "Not Rated", None]`


			`def _parse_decade(mood: str) -> tuple[int, int] \| None:`
			`"""Extract decade filter from mood text."""`
			`match = re.search(r"\b(19\|20)(\d)0s\b", mood.lower())`
			`if match:`
			`decade_start = int(match.group(1) + match.group(2) + "0")`
			`return (decade_start, decade_start + 9)`

			`match = re.search(r"\b(old\|classic\|vintage\|retro)\b", mood.lower())`
			`if match:`
			`return (1920, 1989)`

			`return None`


			`def _is_rating_appropriate(content_rating: str \| None, max_rating: str \| None) -> bool:`
			`"""Check if a movie's content rating is at or below the max allowed."""`
			`if max_rating is None:`
			`return True`
			`if content_rating is None:`
			`return True # Unknown rating, let it through`

			`try:`
			`movie_idx = RATING_ORDER.index(content_rating)`
			`max_idx = RATING_ORDER.index(max_rating)`
			`return movie_idx <= max_idx`
			`except ValueError:`
			`return True # Unknown rating format, let it through`


			`def _parse_movie(raw: dict) -> Movie:`
			`"""Convert a raw DB row dict into a Movie model."""`
			`return Movie(`
			`jellyfin_id=raw["jellyfin_id"],`
			`title=raw["title"],`
			`sort_title=raw.get("sort_title"),`
			`year=raw.get("year"),`
			`genres=json.loads(raw.get("genres") or "[]"),`
			`overview=raw.get("overview"),`
			`community_rating=raw.get("community_rating"),`
			`critic_rating=raw.get("critic_rating"),`
			`runtime_minutes=raw.get("runtime_minutes"),`
			`content_rating=raw.get("content_rating"),`
			`studios=json.loads(raw.get("studios") or "[]"),`
			`people=json.loads(raw.get("people") or "[]"),`
			`tags=json.loads(raw.get("tags") or "[]"),`
			`)`


Add runtime filter, kid-friendly toggle, surprise me, and re-roll 2026-03-14 20:07:05 -07:00			`def prefilter_candidates(movies_raw: list[dict], mood: str, max_candidates: int = 200, kid_friendly: bool = False) -> list[Movie]:`
Initial commit — Movie Night media discovery app 2026-03-14 19:20:56 -07:00			`"""Score and filter movies based on mood signals. Returns top candidates as Movie models."""`
			`mood_lower = mood.lower()`

			`# Collect all active signals`
			`boost_genres: set[str] = set()`
			`penalize_genres: set[str] = set()`
			`max_rating: str \| None = None`
			`decade = _parse_decade(mood)`

Add runtime filter, kid-friendly toggle, surprise me, and re-roll 2026-03-14 20:07:05 -07:00			`# Kid-friendly toggle overrides`
			`if kid_friendly:`
			`max_rating = "PG-13"`
			`boost_genres.update(["Family", "Animation", "Comedy", "Adventure"])`
			`penalize_genres.update(["Horror", "Thriller"])`

Initial commit — Movie Night media discovery app 2026-03-14 19:20:56 -07:00			`for keyword, signals in MOOD_SIGNALS.items():`
			`if keyword in mood_lower:`
			`boost_genres.update(signals["boost"])`
			`penalize_genres.update(signals["penalize"])`
			`if signals["max_rating"] and (max_rating is None or RATING_ORDER.index(signals["max_rating"]) < RATING_ORDER.index(max_rating)):`
			`max_rating = signals["max_rating"]`

			`# Remove any genres that appear in both boost and penalize`
			`penalize_genres -= boost_genres`

			`scored: list[tuple[float, dict]] = []`

			`for raw in movies_raw:`
			`movie_genres = set(json.loads(raw.get("genres") or "[]"))`
			`content_rating = raw.get("content_rating")`

			`# Filter by content rating`
			`if not _is_rating_appropriate(content_rating, max_rating):`
			`continue`

			`# Filter by decade`
			`year = raw.get("year")`
			`if decade and year:`
			`if year < decade[0] or year > decade[1]:`
			`continue`

			`# Score the movie`
			`score = 0.0`

			`# Genre match bonus`
			`if boost_genres:`
			`genre_overlap = len(movie_genres & boost_genres)`
			`score += genre_overlap * 3.0`

			`# Genre penalty`
			`if penalize_genres:`
			`penalty_overlap = len(movie_genres & penalize_genres)`
			`score -= penalty_overlap * 2.0`

			`# Rating bonus (higher rated movies get a small boost)`
			`rating = raw.get("community_rating")`
			`if rating:`
			`score += rating * 0.3`

			`# Keyword match in overview`
			`overview = (raw.get("overview") or "").lower()`
			`mood_words = [w for w in mood_lower.split() if len(w) > 3]`
			`for word in mood_words:`
			`if word in overview:`
			`score += 1.0`

			`scored.append((score, raw))`

			`# Sort by score descending`
			`scored.sort(key=lambda x: x[0], reverse=True)`

			`# Return top candidates as Movie models`
			`return [_parse_movie(raw) for _, raw in scored[:max_candidates]]`