"""
Voice Analysis source — the open replacement for iMotions' Voice Analysis
(AssemblyAI / audEERING), HARDWARE.md tier=ml.

Per audio window it publishes TWO LSL streams the recorder captures:
  * "Voice"      — paralinguistic features (loudness, pitch, jitter, shimmer) from
                   openSMILE (eGeMAPS), numeric, one sample per window.
  * "VoiceText"  — Whisper speech-to-text transcripts, as a string/marker stream,
                   so spoken words are event-locked exactly like stimulus markers.

Whisper and openSMILE are offline and free (no AssemblyAI dependency). Both are
lazy-imported and fully injectable, so this runs and tests with no audio stack.
"""

from __future__ import annotations

import time
from typing import Callable

from pylsl import StreamOutlet, local_clock

from ..core.source import StreamSpec
from ..drivers._base import DriverSource

# Channel order of the numeric "Voice" stream; one value per analysis window.
FEATURES = [
    "loudness",
    "pitch_hz",
    "jitter",
    "shimmer",
]

# Our channel name -> candidate openSMILE (eGeMAPSv02 functionals) column
# names, tried left to right; the first column present in the output wins.
# NOTE: the first pitch candidate is, per its openSMILE name, expressed in
# semitones relative to 27.5 Hz rather than in Hz.
_SMILE_MAP = {
    "loudness": [
        "loudness_sma3_amean",
        "Loudness_sma3_amean",
    ],
    "pitch_hz": [
        "F0semitoneFrom27.5Hz_sma3nz_amean",
        "F0_sma3nz_amean",
    ],
    "jitter": [
        "jitterLocal_sma3nz_amean",
    ],
    "shimmer": [
        "shimmerLocaldB_sma3nz_amean",
    ],
}


class VoiceSource(DriverSource):
    """
    Driver source that turns audio windows into voice features and transcripts.

    Each window grabbed in :meth:`read` is yielded as one numeric sample (the
    FEATURES channels) and, when the transcriber returns non-empty text, is
    also pushed as a string sample on a second "VoiceText" outlet.

    kwargs (via drivers.make):
      mic          : input device index (default 0).
      block        : analysis window seconds (default 2.0).
      sr           : sample rate (default 16000, Whisper-native).
      audio_source : callable -> mono float32 ndarray | None. Overrides the mic
                     (tests / file playback / external capture). If it exposes
                     a ``close`` attribute, that is called when read() ends.
      smile        : callable(audio, sr) -> dict of features. Overrides openSMILE.
      transcribe   : callable(audio, sr) -> str. Overrides Whisper.
      whisper_model: model size for Whisper (default "base").

    NOTE: the built-in Whisper transcriber does not forward ``sr`` to the model
    and does no resampling, so audio is handed over at whatever rate it was
    captured with.
    """

    def __init__(self, device, *, mic=0, block=2.0, sr=16000,
                 audio_source: Callable | None = None,
                 smile: Callable | None = None,
                 transcribe: Callable | None = None,
                 whisper_model="base", **opts):
        """Store the injectable engines and default the device's channel list."""
        if not device.channels:
            # object.__setattr__ goes around the device's own __setattr__
            # (presumably because the device object is frozen — confirm).
            object.__setattr__(device, "channels", tuple(FEATURES))
        super().__init__(device, mic=mic, block=block, sr=sr, **opts)
        self._audio_source = audio_source
        self._smile = smile
        self._transcribe = transcribe
        self._whisper_model = whisper_model
        # Created in read() and dropped again when read() finishes.
        self._text_outlet: StreamOutlet | None = None

    # --- lazy engines (overridable) --------------------------------------
    def _get_smile(self):
        """Return the feature extractor, building the openSMILE one on first use."""
        if self._smile is None:
            import opensmile  # lazy
            sm = opensmile.Smile(feature_set=opensmile.FeatureSet.eGeMAPSv02,
                                 feature_level=opensmile.FeatureLevel.Functionals)
            self._smile = lambda audio, sr: _smile_to_dict(sm.process_signal(audio, sr))
        return self._smile

    def _get_transcribe(self):
        """Return the transcriber, loading the Whisper model on first use."""
        if self._transcribe is None:
            import whisper  # lazy
            model = whisper.load_model(self._whisper_model)
            # `sr` is accepted for interface symmetry but not passed to Whisper.
            self._transcribe = lambda audio, sr: model.transcribe(
                audio, fp16=False).get("text", "").strip()
        return self._transcribe

    # --- loop ------------------------------------------------------------
    def read(self):
        """
        Generate ``(feature_values, timestamp)`` pairs, one per audio window.

        Yields:
            A list of floats ordered like FEATURES (missing features become
            0.0) together with the LSL clock time taken right after the window
            was captured. The transcript of the same window, if any, is pushed
            to the "VoiceText" outlet with that same timestamp.
        """
        sr = int(self.opts["sr"])
        block = float(self.opts["block"])
        grab = self._audio_source or _mic_grabber(self.opts["mic"], sr, block)
        try:
            # Set up inside the try so the audio source is still closed when an
            # engine fails to load or the outlet cannot be created.
            smile = self._get_smile()
            transcribe = self._get_transcribe()
            # second outlet for transcripts (recorder resolves it like any stream)
            self._text_outlet = StreamOutlet(StreamSpec(
                name="VoiceText", stype="Markers", channels=["transcript"],
                nominal_srate=0.0, channel_format="string",
                source_id="voice-text-0").to_lsl())
            while not self.stopping:
                audio = grab()
                if audio is None or len(audio) == 0:
                    time.sleep(0.05)
                    continue
                # Stamp the window once, before the (slow) analysis, so the
                # features and the transcript share one timestamp that does
                # not drift with openSMILE / Whisper processing time.
                stamp = local_clock()
                feats = smile(audio, sr)
                yield [float(feats.get(f, 0.0)) for f in FEATURES], stamp
                text = transcribe(audio, sr)
                if text:
                    self._text_outlet.push_sample([f"speech:{text}"], stamp)
        finally:
            # Drop our reference so the transcript outlet can be torn down
            # instead of lingering after the loop has ended.
            self._text_outlet = None
            closer = getattr(grab, "close", None)
            if closer:
                closer()


# --------------------------------------------------------------------------
def _smile_to_dict(df) -> dict:
    """
    Reduce an openSMILE functionals frame to ``{channel: float}``.

    Only the first row of ``df`` is used. For every channel in ``_SMILE_MAP``
    the first candidate column present in that row is taken; channels with no
    matching column are left out of the result. An empty frame gives ``{}``.

    The ``pitch_hz`` channel is always reported in Hz: when it is sourced from
    the ``F0semitoneFrom27.5Hz...`` column (semitones above 27.5 Hz) the value
    is converted with ``27.5 * 2 ** (semitones / 12)``. Because the source is a
    mean over semitones, the converted figure is a geometric-mean pitch.
    """
    if len(df) == 0:
        return {}
    row = df.iloc[0]
    out = {}
    for ch, names in _SMILE_MAP.items():
        name = next((n for n in names if n in row.index), None)
        if name is None:
            continue
        value = float(row[name])
        if ch == "pitch_hz" and name.startswith("F0semitoneFrom27.5Hz"):
            # An exact 0 is kept as 0 rather than mapped to 27.5 Hz; it is
            # presumably what openSMILE reports when no voiced frame was
            # found — confirm. NaN stays NaN through the conversion.
            if value != 0.0:
                value = 27.5 * 2.0 ** (value / 12.0)
        out[ch] = value
    return out


def _mic_grabber(mic, sr, block):
    """
    Build a zero-argument callable that records one window from a microphone.

    Args:
        mic: input device passed to sounddevice as ``device``.
        sr: sample rate in Hz.
        block: window length in seconds.

    Returns:
        A callable that records ``int(sr * block)`` frames of one-channel
        float32 audio and returns them as a flat float32 array.
    """
    import sounddevice as sd  # lazy; needs PortAudio + a device
    import numpy as np

    n_frames = int(sr * block)

    def record_window():
        recording = sd.rec(
            n_frames,
            samplerate=sr,
            channels=1,
            dtype="float32",
            device=mic,
        )
        # Block until the recording started above has completed.
        sd.wait()
        flat = recording.reshape(-1)
        return flat.astype(np.float32)

    return record_window
