audio_model.py 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696
  1. """
  2. Audio Model Module
  3. Audio classification model for durian ripeness detection using knock detection and mel-spectrogram features.
  4. Detects knocks in audio using librosa, extracts mel-spectrograms, and averages predictions across knocks.
  5. """
  6. import os
  7. import tempfile
  8. import pickle
  9. import json
  10. from pathlib import Path
  11. from typing import Dict, Any, Tuple, Optional, List
  12. import logging
  13. import numpy as np
  14. import librosa
  15. import librosa.display
  16. import matplotlib
  17. matplotlib.use('agg') # Use non-interactive backend
  18. import matplotlib.pyplot as plt
  19. from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg as FigureCanvas
  20. from PyQt5.QtGui import QImage, QPixmap
  21. from models.base_model import BaseModel
  22. from utils.config import (
  23. AUDIO_MODEL_PATH,
  24. SPECTROGRAM_FIG_SIZE,
  25. RIPENESS_CLASSES,
  26. )
  27. # Import TensorFlow
  28. try:
  29. import tensorflow as tf
  30. except ImportError:
  31. tf = None
  32. logger = logging.getLogger(__name__)
  33. class AudioModel(BaseModel):
  34. """
  35. Audio-based ripeness classification model.
  36. Detects knocks in durian audio using onset detection, extracts mel-spectrogram features,
  37. and averages predictions across all detected knocks for robust ripeness classification.
  38. Attributes:
  39. model: Keras model for mel-spectrogram classification
  40. label_encoder: Scikit-learn label encoder for class names
  41. preprocessing_stats: JSON statistics (max_length, normalization params)
  42. class_names: List of class names (unripe, ripe, overripe)
  43. """
  # Mel-spectrogram settings; these have to stay identical to the values used at training time.
  MEL_PARAMS = dict(
      n_mels=64,
      hop_length=512,
      n_fft=2048,
      sr=22050,
  )

  # Settings that control how individual knocks are located and cut out of the recording.
  KNOCK_DETECTION = dict(
      delta=0.3,  # Onset detection delta
      wait=10,  # Onset detection wait frames
      onset_shift=0.05,  # Shift onsets back by 50ms
      knock_duration=0.2,  # Extract 200ms per knock
  )
  def __init__(self, model_path: Optional[str] = None, device: str = "cpu"):
      """
      Create an audio model that has not been loaded yet.

      Nothing is read from disk here; the model, label encoder and preprocessing
      statistics stay ``None`` until ``load()`` is called.

      Args:
          model_path: Path to model directory (optional). When omitted, the
              configured AUDIO_MODEL_PATH is used.
          device: Device to use (cpu/gpu - not used for TensorFlow)
      """
      # Fall back to the configured directory (models/audio/) when no path is given.
      resolved_path = str(AUDIO_MODEL_PATH) if model_path is None else model_path
      super().__init__(resolved_path, device)

      self.class_names = RIPENESS_CLASSES
      # Placeholders populated by load().
      self.model = None
      self.label_encoder = None
      self.preprocessing_stats = None

      logger.info(f"AudioModel initialized with model_path: {resolved_path}")
  def load(self) -> bool:
      """
      Load the Keras model, label encoder, and preprocessing statistics.

      The model directory is either ``self.model_path`` itself or its
      ``voice_memos_ripeness`` subdirectory, whichever contains
      ``best_model_mel_spec_grouped.keras``. All three required files are
      checked before anything is loaded, and the instance attributes are only
      assigned after every file has been read, so a failed load does not leave
      the instance with only some of the three attributes updated.

      Returns:
          bool: True if loaded successfully, False otherwise
      """
      model_filename = "best_model_mel_spec_grouped.keras"
      try:
          if tf is None:
              # The module-level TensorFlow import is optional; report its absence
              # explicitly instead of failing later with an opaque AttributeError.
              logger.error("TensorFlow is not installed; cannot load the audio model")
              self._is_loaded = False
              return False

          base_dir = Path(self.model_path)
          # Files may sit directly in model_path or in the voice_memos_ripeness subdir.
          candidate_dirs = (base_dir, base_dir / "voice_memos_ripeness")
          model_dir = next(
              (d for d in candidate_dirs if (d / model_filename).exists()),
              None,
          )
          if model_dir is None:
              logger.error(f"Could not find model files in: {base_dir} or {base_dir / 'voice_memos_ripeness'}")
              return False

          logger.info(f"Loading audio model from {model_dir}")
          model_file = model_dir / model_filename
          encoder_path = model_dir / "label_encoder.pkl"
          stats_path = model_dir / "preprocessing_stats.json"

          # Verify the companion files before paying for the (slow) model load.
          if not encoder_path.exists():
              logger.error(f"Label encoder not found: {encoder_path}")
              return False
          if not stats_path.exists():
              logger.error(f"Preprocessing stats not found: {stats_path}")
              return False

          # Load Keras model
          logger.info(f"Loading TensorFlow model from {model_file}...")
          model = tf.keras.models.load_model(str(model_file))
          logger.info("✓ TensorFlow model loaded successfully")

          # Load label encoder
          logger.info(f"Loading label encoder from {encoder_path}...")
          # NOTE(security): unpickling executes arbitrary code from the file. Only
          # point model_path at directories whose contents are trusted.
          with open(encoder_path, 'rb') as f:
              label_encoder = pickle.load(f)
          logger.info(f"✓ Label encoder loaded with classes: {list(label_encoder.classes_)}")

          # Load preprocessing stats
          logger.info(f"Loading preprocessing stats from {stats_path}...")
          with open(stats_path, 'r', encoding='utf-8') as f:
              preprocessing_stats = json.load(f)
          logger.info(f"✓ Preprocessing stats loaded, max_length: {preprocessing_stats.get('max_length')}")

          # Publish everything at once now that all three artifacts are available.
          self.model = model
          self.label_encoder = label_encoder
          self.preprocessing_stats = preprocessing_stats
          self._is_loaded = True
          logger.info("✓ Audio model loaded successfully")
          return True
      except Exception as e:
          logger.error(f"Failed to load audio model: {e}", exc_info=True)
          self._is_loaded = False
          return False
  def predict(self, audio_path: str) -> Dict[str, Any]:
      """
      Predict ripeness from an audio file using knock detection and mel-spectrogram analysis.

      The recording is converted to WAV if needed, loaded as mono at the
      training sample rate, and a fixed 0.5 s is cut from each end (only from
      the start when the clip is shorter than 1 s; not at all when it is 0.5 s
      or shorter). Knocks are detected on the trimmed signal, so the reported
      knock times are relative to the trimmed audio. Class probabilities are
      averaged over all knocks to obtain the final prediction.

      Args:
          audio_path: Path to audio file (supports WAV and other formats via librosa)

      Returns:
          Dict containing:
          - 'class_name': Predicted class name, capitalized for display
          - 'class_index': Predicted class index (int)
          - 'probabilities': Dictionary of class probabilities (0-1 range)
          - 'confidence': Confidence score (0-1 range, averaged across knocks)
          - 'spectrogram_image': QPixmap of mel-spectrogram with knocks marked
          - 'waveform_image': QPixmap of waveform with knocks marked
          - 'knock_count': Number of knocks detected
          - 'knock_times': List of knock onset times in seconds
          - 'per_knock_predictions': One dict per knock with its own class,
            class index, confidence and probabilities
          - 'success': Whether prediction succeeded
          - 'error': Error message if failed

      Raises:
          RuntimeError: If the model has not been loaded.
      """
      if not self._is_loaded or self.model is None:
          raise RuntimeError("Model not loaded. Call load() first.")

      wav_path = None
      try:
          # Ensure audio is in WAV format
          wav_path = self._ensure_wav_format(audio_path)

          # Load audio
          logger.info(f"Loading audio from {audio_path}")
          y, sr = librosa.load(wav_path, sr=self.MEL_PARAMS['sr'], mono=True)

          # Drop a fixed 0.5 s from the beginning and end of the recording.
          cut_samples = int(0.5 * sr)
          if len(y) > 2 * cut_samples:
              y = y[cut_samples:-cut_samples]
          elif len(y) > cut_samples:
              y = y[cut_samples:]

          # Detect knocks and extract features
          logger.info("Detecting knocks in audio...")
          features, knock_times = self._extract_knock_features(y, sr)
          # Plain floats keep the result dict and log output free of numpy scalars.
          knock_times = [float(t) for t in knock_times]
          logger.info(f"DEBUG: Detected {len(features)} knocks")

          if len(features) == 0:
              logger.error("❌ No knocks detected in audio file - returning error")
              return self._failed_prediction_result('No knocks detected in audio file')

          logger.info(f"Detected {len(features)} knocks at times: {knock_times}")

          # Prepare features for model: every knock becomes exactly max_length
          # frames. Longer features are truncated first, because np.pad rejects
          # a negative pad width.
          max_length = self.preprocessing_stats['max_length']
          fitted = []
          for feature in features:
              feature = feature[:max_length]
              fitted.append(
                  np.pad(feature, ((0, max_length - feature.shape[0]), (0, 0)), mode='constant')
              )
          X = np.array(fitted)
          if X.ndim == 3:
              # Add the trailing channel axis.
              X = np.expand_dims(X, -1)

          # Run inference
          logger.info(f"Running model inference on {len(features)} knocks...")
          probs = self.model.predict(X, verbose=0)

          # Get per-knock predictions
          classes = self.label_encoder.classes_
          per_knock_preds = []
          for knock_probs in probs:
              knock_pred_idx = int(np.argmax(knock_probs))
              per_knock_preds.append({
                  'class': classes[knock_pred_idx],
                  'class_idx': knock_pred_idx,
                  'confidence': float(knock_probs[knock_pred_idx]),
                  'probabilities': {cls: float(p) for cls, p in zip(classes, knock_probs)},
              })
          logger.info(f"Per-knock predictions: {[p['class'] for p in per_knock_preds]}")

          # Average predictions across all knocks (CONFIDENCE LOGIC)
          avg_probs = np.mean(probs, axis=0)
          predicted_idx = int(np.argmax(avg_probs))
          predicted_class = classes[predicted_idx]
          confidence = float(avg_probs[predicted_idx])
          logger.info(f"Average probabilities: {dict(zip(self.label_encoder.classes_, avg_probs))}")
          logger.info(f"Final prediction: {predicted_class} ({confidence:.2%})")

          # Create probability dictionary
          prob_dict = {cls: float(p) for cls, p in zip(classes, avg_probs)}

          # Capitalize class name for display
          predicted_class_display = predicted_class.capitalize() if isinstance(predicted_class, str) else predicted_class

          # Generate visualizations with knock annotations
          spectrogram_image = self._generate_mel_spectrogram_with_knocks(y, sr, knock_times)
          waveform_image = self._generate_waveform_with_knocks(y, sr, knock_times)

          logger.info(f"Prediction: {predicted_class_display} ({confidence:.2%}) from {len(features)} knocks")
          return {
              'success': True,
              'class_name': predicted_class_display,
              'class_index': predicted_idx,
              'probabilities': prob_dict,
              'confidence': confidence,
              'spectrogram_image': spectrogram_image,
              'waveform_image': waveform_image,
              'knock_count': len(features),
              'knock_times': knock_times,
              'per_knock_predictions': per_knock_preds,
              'error': None
          }
      except Exception as e:
          error_msg = str(e)
          logger.error(f"Prediction failed: {error_msg}", exc_info=True)
          # Provide helpful error message for audio format issues
          if 'convert' in error_msg.lower() or 'format' in error_msg.lower():
              error_msg += " - Please ensure ffmpeg is installed: conda install -c conda-forge ffmpeg"
          return self._failed_prediction_result(error_msg)
      finally:
          # _ensure_wav_format returns the input path unchanged for WAV files and a
          # temporary file otherwise; only the temporary one may be deleted.
          if wav_path is not None and wav_path != audio_path:
              try:
                  os.remove(wav_path)
              except OSError as cleanup_error:
                  logger.debug(f"Could not remove temporary WAV {wav_path}: {cleanup_error}")

  @staticmethod
  def _failed_prediction_result(error_msg: str) -> Dict[str, Any]:
      """Build the result dict predict() returns when no prediction could be made."""
      return {
          'success': False,
          'class_name': None,
          'class_index': None,
          'probabilities': {},
          'confidence': 0.0,
          'spectrogram_image': None,
          'waveform_image': None,
          'knock_count': 0,
          'knock_times': [],
          'per_knock_predictions': [],
          'error': error_msg
      }
  257. def _ensure_wav_format(self, audio_path: str) -> str:
  258. """
  259. Ensure audio file is in WAV format, converting if necessary.
  260. Supports M4A, MP3, OGG, FLAC, WMA and other formats.
  261. Args:
  262. audio_path: Path to audio file
  263. Returns:
  264. Path to WAV file (original or converted)
  265. """
  266. ext = os.path.splitext(audio_path)[1].lower()
  267. if ext == '.wav':
  268. return audio_path
  269. logger.info(f"Converting {ext} to WAV format...")
  270. # Try ffmpeg first (most reliable for various formats)
  271. try:
  272. import subprocess
  273. tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
  274. tmp_path = tmp_file.name
  275. tmp_file.close()
  276. # Use ffmpeg to convert
  277. cmd = [
  278. 'ffmpeg',
  279. '-i', audio_path,
  280. '-acodec', 'pcm_s16le',
  281. '-ar', str(self.MEL_PARAMS['sr']),
  282. '-ac', '1', # mono
  283. '-y', # overwrite
  284. tmp_path
  285. ]
  286. logger.info(f"Using ffmpeg to convert {ext}")
  287. subprocess.run(cmd, capture_output=True, check=True, timeout=30)
  288. logger.info(f"Converted to temporary WAV: {tmp_path}")
  289. return tmp_path
  290. except Exception as e:
  291. logger.warning(f"ffmpeg conversion failed: {e}, trying pydub...")
  292. # Try pydub second (handles most formats if installed)
  293. try:
  294. from pydub import AudioSegment
  295. logger.info(f"Using pydub to convert {ext}")
  296. audio = AudioSegment.from_file(audio_path)
  297. tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
  298. audio.export(tmp_file.name, format='wav')
  299. logger.info(f"Converted to temporary WAV: {tmp_file.name}")
  300. return tmp_file.name
  301. except Exception as e:
  302. logger.warning(f"pydub conversion failed: {e}, trying librosa...")
  303. # Try librosa as final fallback
  304. try:
  305. import soundfile as sf
  306. except ImportError:
  307. logger.warning("soundfile not available, using scipy for conversion")
  308. sf = None
  309. try:
  310. logger.info("Using librosa to load and convert audio")
  311. # Load with librosa (requires ffmpeg backend for non-WAV)
  312. y, sr = librosa.load(audio_path, sr=self.MEL_PARAMS['sr'], mono=True)
  313. tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
  314. if sf is not None:
  315. sf.write(tmp_file.name, y, sr)
  316. else:
  317. # Fallback: use scipy
  318. import scipy.io.wavfile as wavfile
  319. # Normalize to 16-bit range
  320. y_int16 = np.clip(y * 32767, -32768, 32767).astype(np.int16)
  321. wavfile.write(tmp_file.name, sr, y_int16)
  322. logger.info(f"Converted to temporary WAV: {tmp_file.name}")
  323. return tmp_file.name
  324. except Exception as e:
  325. logger.error(f"Audio conversion failed with all methods: {e}")
  326. logger.error(f"Please ensure ffmpeg is installed or install pydub: pip install pydub")
  327. raise RuntimeError(
  328. f"Failed to convert {ext} audio file. "
  329. "Install ffmpeg or pydub: 'pip install pydub' and 'pip install ffmpeg'"
  330. ) from e
  331. def _extract_knock_features(self, audio: np.ndarray, sr: int) -> Tuple[List[np.ndarray], List[float]]:
  332. """
  333. Detect knocks in audio and extract mel-spectrogram features.
  334. Args:
  335. audio: Audio time series
  336. sr: Sample rate
  337. Returns:
  338. Tuple of:
  339. - List of mel-spectrogram arrays (one per knock)
  340. - List of knock onset times in seconds
  341. """
  342. # Detect onsets (knock starts)
  343. logger.info("Detecting onset times...")
  344. onset_frames = librosa.onset.onset_detect(
  345. y=audio, sr=sr,
  346. delta=self.KNOCK_DETECTION['delta'],
  347. wait=self.KNOCK_DETECTION['wait'],
  348. units='frames'
  349. )
  350. onset_times = librosa.frames_to_time(onset_frames, sr=sr)
  351. if len(onset_times) == 0:
  352. logger.warning("No onsets detected")
  353. return [], []
  354. # Shift onsets slightly back
  355. shifted_times = [max(0, t - self.KNOCK_DETECTION['onset_shift']) for t in onset_times]
  356. logger.info(f"Detected {len(shifted_times)} onset times")
  357. # Extract knock segments
  358. knock_samples = int(round(self.KNOCK_DETECTION['knock_duration'] * sr))
  359. knocks = []
  360. valid_times = []
  361. for onset in shifted_times:
  362. start = int(round(onset * sr))
  363. end = start + knock_samples
  364. if start >= len(audio):
  365. continue
  366. if end <= len(audio):
  367. knock = audio[start:end]
  368. else:
  369. # Pad with zeros if at end
  370. knock = np.zeros(knock_samples, dtype=audio.dtype)
  371. available = len(audio) - start
  372. if available > 0:
  373. knock[:available] = audio[start:]
  374. else:
  375. continue
  376. knocks.append(knock)
  377. valid_times.append(onset)
  378. # Extract mel-spectrograms
  379. logger.info(f"Extracting mel-spectrograms from {len(knocks)} knocks...")
  380. features = []
  381. for knock in knocks:
  382. mel_spec = self._extract_mel_spectrogram(knock, sr)
  383. features.append(mel_spec)
  384. return features, valid_times
  def _extract_mel_spectrogram(self, audio: np.ndarray, sr: int) -> np.ndarray:
      """
      Turn one audio segment into a normalized, time-major mel-spectrogram.

      Args:
          audio: Audio segment
          sr: Sample rate

      Returns:
          Normalized mel-spectrogram (time, n_mels)
      """
      params = self.MEL_PARAMS

      # Mel power spectrogram, expressed in dB relative to its own maximum.
      mel_power = librosa.feature.melspectrogram(
          y=audio, sr=sr,
          n_mels=params['n_mels'],
          hop_length=params['hop_length'],
          n_fft=params['n_fft']
      )
      mel_db = librosa.power_to_db(mel_power, ref=np.max)

      # Zero-mean always; unit variance only when the segment is not constant.
      spread = np.std(mel_db)
      if spread == 0:
          normalized = mel_db - np.mean(mel_db)
      else:
          normalized = (mel_db - np.mean(mel_db)) / spread

      # Transpose so that time is the leading axis: (time, n_mels).
      return normalized.T
  def predict_batch(self, audio_paths: list) -> list:
      """
      Run ``predict`` on each audio file in turn.

      Args:
          audio_paths: List of paths to audio files

      Returns:
          List[Dict]: One prediction result per input path, in input order
      """
      return [self.predict(path) for path in audio_paths]
  def _generate_waveform_with_knocks(self, audio: np.ndarray, sr: int, knock_times: List[float]) -> Optional[QPixmap]:
      """
      Generate waveform visualization with knock locations marked.

      Each knock is drawn as a dashed vertical line at its onset plus a shaded
      span covering ``knock_duration`` seconds. The matplotlib figure is always
      closed, including when rendering fails part-way.

      Args:
          audio: Audio time series
          sr: Sample rate
          knock_times: List of knock onset times in seconds

      Returns:
          QPixmap: Waveform plot with knock markers, or None if rendering failed
      """
      fig = None
      try:
          fig, ax = plt.subplots(figsize=SPECTROGRAM_FIG_SIZE)

          # Plot waveform
          librosa.display.waveshow(audio, sr=sr, alpha=0.6, ax=ax)

          # Mark knock locations
          knock_duration = self.KNOCK_DETECTION['knock_duration']
          for knock_time in knock_times:
              # Vertical line at onset
              ax.axvline(knock_time, color='red', linestyle='--', alpha=0.8, linewidth=1.5)
              # Span showing knock duration
              ax.axvspan(knock_time, knock_time + knock_duration, color='orange', alpha=0.2)

          ax.set_title(f'Waveform with {len(knock_times)} Detected Knocks')
          ax.set_xlabel('Time (s)')
          ax.set_ylabel('Amplitude')
          ax.grid(True, alpha=0.3)

          # Convert to QPixmap
          canvas = FigureCanvas(fig)
          canvas.draw()
          width_px, height_px = fig.get_size_inches() * fig.get_dpi()
          width_px, height_px = int(width_px), int(height_px)
          img = QImage(canvas.buffer_rgba(), width_px, height_px, QImage.Format_ARGB32)
          # The buffer holds RGBA bytes while the image is declared ARGB32, so the
          # red and blue channels are swapped back here.
          return QPixmap(img.rgbSwapped())
      except Exception as e:
          logger.error(f"Failed to generate waveform: {e}", exc_info=True)
          return None
      finally:
          # Release the figure even on failure; otherwise pyplot keeps it alive.
          if fig is not None:
              plt.close(fig)
  def _generate_mel_spectrogram_with_knocks(self, audio: np.ndarray, sr: int, knock_times: List[float]) -> Optional[QPixmap]:
      """
      Generate mel-spectrogram visualization with knock locations marked.

      The spectrogram is computed over the whole signal with MEL_PARAMS and
      shown in dB relative to its maximum. Each knock is drawn as a dashed
      vertical line at its onset plus a shaded span covering ``knock_duration``
      seconds. The matplotlib figure is always closed, including when rendering
      fails part-way.

      Args:
          audio: Audio time series
          sr: Sample rate
          knock_times: List of knock onset times in seconds

      Returns:
          QPixmap: Mel-spectrogram plot with knock markers, or None if rendering failed
      """
      fig = None
      try:
          params = self.MEL_PARAMS

          # Compute mel-spectrogram
          S = librosa.feature.melspectrogram(
              y=audio, sr=sr,
              n_mels=params['n_mels'],
              hop_length=params['hop_length'],
              n_fft=params['n_fft']
          )
          # Convert to dB scale
          S_db = librosa.power_to_db(S, ref=np.max)

          fig = plt.figure(figsize=SPECTROGRAM_FIG_SIZE)
          ax = fig.add_subplot(111)

          # Display mel-spectrogram
          img = librosa.display.specshow(
              S_db,
              x_axis='time',
              y_axis='mel',
              sr=sr,
              hop_length=params['hop_length'],
              cmap='magma',
              ax=ax
          )

          # Mark knock locations
          knock_duration = self.KNOCK_DETECTION['knock_duration']
          for knock_time in knock_times:
              # Vertical line at onset
              ax.axvline(knock_time, color='cyan', linestyle='--', alpha=0.8, linewidth=1.5)
              # Span showing knock duration
              ax.axvspan(knock_time, knock_time + knock_duration, color='cyan', alpha=0.15)

          # The coefficient count in the title follows the configured n_mels.
          ax.set_title(f"Mel Spectrogram with {len(knock_times)} Detected Knocks ({params['n_mels']} Coefficients)")
          ax.set_xlabel('Time (s)')
          ax.set_ylabel('Mel Frequency')

          # Work on this figure explicitly rather than on pyplot's current figure.
          fig.colorbar(img, ax=ax, format='%+2.0f dB', label='Power (dB)')
          fig.tight_layout()

          # Convert to QPixmap
          canvas = FigureCanvas(fig)
          canvas.draw()
          width_px, height_px = fig.get_size_inches() * fig.get_dpi()
          width_px, height_px = int(width_px), int(height_px)
          img_qimage = QImage(canvas.buffer_rgba(), width_px, height_px, QImage.Format_ARGB32)
          # The buffer holds RGBA bytes while the image is declared ARGB32, so the
          # red and blue channels are swapped back here.
          return QPixmap(img_qimage.rgbSwapped())
      except Exception as e:
          logger.error(f"Failed to generate mel-spectrogram: {e}", exc_info=True)
          return None
      finally:
          # Release the figure even on failure; otherwise pyplot keeps it alive.
          if fig is not None:
              plt.close(fig)
  def _generate_spectrogram_image(self, audio: np.ndarray, sr: int) -> Optional[QPixmap]:
      """
      Generate a mel-spectrogram visualization from audio, without knock markers.

      The spectrogram is computed with MEL_PARAMS and shown in dB relative to
      its maximum. The matplotlib figure is always closed, including when
      rendering fails part-way.

      Args:
          audio: Audio time series
          sr: Sample rate

      Returns:
          QPixmap: Rendered mel-spectrogram image or None if failed
      """
      fig = None
      try:
          params = self.MEL_PARAMS

          # Compute mel-spectrogram
          S = librosa.feature.melspectrogram(
              y=audio, sr=sr,
              n_mels=params['n_mels'],
              hop_length=params['hop_length'],
              n_fft=params['n_fft']
          )
          # Convert to dB scale
          S_db = librosa.power_to_db(S, ref=np.max)

          # Create figure
          fig, ax = plt.subplots(figsize=SPECTROGRAM_FIG_SIZE)

          # Display mel-spectrogram
          librosa.display.specshow(
              S_db,
              x_axis='time',
              y_axis='mel',
              sr=sr,
              hop_length=params['hop_length'],
              cmap='magma',
              ax=ax
          )
          # The coefficient count in the title follows the configured n_mels.
          ax.set_title(f"Mel Spectrogram ({params['n_mels']} coefficients)")
          ax.set_xlabel('Time (s)')
          ax.set_ylabel('Mel Frequency')

          # Convert to QPixmap
          canvas = FigureCanvas(fig)
          canvas.draw()
          width_px, height_px = fig.get_size_inches() * fig.get_dpi()
          width_px, height_px = int(width_px), int(height_px)
          img = QImage(canvas.buffer_rgba(), width_px, height_px, QImage.Format_ARGB32)
          # The buffer holds RGBA bytes while the image is declared ARGB32, so the
          # red and blue channels are swapped back here.
          return QPixmap(img.rgbSwapped())
      except Exception as e:
          logger.error(f"Failed to generate spectrogram image: {e}", exc_info=True)
          return None
      finally:
          # Release the figure even on failure; otherwise pyplot keeps it alive.
          if fig is not None:
              plt.close(fig)