A minimal, clean speech-to-text menu bar application for macOS using OpenAI's Whisper.

5 min read Original article ↗
#!/usr/bin/env python3
"""
Minimal Speech-to-Text Menu Bar App
Clean, simple voice transcription using Whisper.
"""

# /// script
# dependencies = [
#     "faster-whisper>=1.0.0",
#     "pyobjc-framework-Cocoa>=10.0",
#     "pyaudio>=0.2.13",
# ]
# ///

import threading
import tempfile
import os
import wave

from Foundation import NSObject, NSTimer, NSOperationQueue
from AppKit import (
    NSApplication,
    NSStatusBar,
    NSMenu,
    NSMenuItem,
    NSVariableStatusItemLength,
    NSOnState,
    NSOffState,
    NSEventModifierFlagCommand,
    NSEventModifierFlagOption,
    NSApplicationActivationPolicyAccessory,
    NSPasteboard,
    NSStringPboardType,
    NSEvent,
    NSEventMaskFlagsChanged,
)
from PyObjCTools import AppHelper
import pyaudio
from faster_whisper import WhisperModel


class WhisperMenuBar(NSObject):
    """Minimal speech-to-text menu bar application.

    Acts as the NSApplication delegate. Holding the trigger modifier
    (Option) records microphone audio; releasing it transcribes the
    recording with faster-whisper and copies the text to the clipboard.
    """

    def applicationDidFinishLaunching_(self, notification):
        """Initialize state, audio, the menu bar UI and the key monitors."""
        print("Starting Whisper Menu Bar...")

        # Set as accessory app (no dock icon)
        app = NSApplication.sharedApplication()
        app.setActivationPolicy_(NSApplicationActivationPolicyAccessory)

        # Initialize state
        self.recording = False
        self.audio_frames = []
        self.whisper_model = None
        self.model_loaded = False
        self.whisper_model_name = "base"
        self.modifier_pressed = False
        self.trigger_modifier = NSEventModifierFlagOption  # Option key

        # Audio settings
        self.audio_format = pyaudio.paInt16
        self.channels = 1
        self.rate = 16000
        self.chunk = 1024
        self.audio_stream = None

        # Initialize audio
        try:
            self.pyaudio_instance = pyaudio.PyAudio()
            print("Audio initialized")
        except Exception as e:
            print(f"Error: Audio initialization failed: {e}")
            self.pyaudio_instance = None

        # Setup menu bar
        self._setup_menu_bar()

        # Setup push-to-talk (Option key monitoring)
        self._setup_push_to_talk()

        # Load model in background so the UI is not blocked while it loads
        threading.Thread(target=self._load_model, daemon=True).start()

    def _setup_menu_bar(self):
        """Create the status bar icon and its menu."""
        # Create status bar item
        status_bar = NSStatusBar.systemStatusBar()
        self.statusItem = status_bar.statusItemWithLength_(NSVariableStatusItemLength)
        self.statusItem.setTitle_("🎤")

        # Create menu
        menu = NSMenu.alloc().init()

        # Status item (shows current state); disabled so it is not clickable
        self.statusMenuItem = NSMenuItem.alloc().initWithTitle_action_keyEquivalent_(
            "Ready", None, ""
        )
        self.statusMenuItem.setEnabled_(False)
        menu.addItem_(self.statusMenuItem)

        # Push-to-talk only (no explicit start/stop menu item), so a single
        # separator is enough between the status line and the model submenu.
        menu.addItem_(NSMenuItem.separatorItem())

        # Model selection
        model_menu_item = NSMenuItem.alloc().initWithTitle_action_keyEquivalent_(
            "Model", None, ""
        )
        model_menu = NSMenu.alloc().init()

        self.modelMenuItems = {}
        for model_name in ["tiny", "base", "small", "medium"]:
            item = NSMenuItem.alloc().initWithTitle_action_keyEquivalent_(
                model_name.capitalize(), "changeModel:", ""
            )
            item.setTarget_(self)
            item.setRepresentedObject_(model_name)
            # Check the entry matching the configured default model
            if model_name == self.whisper_model_name:
                item.setState_(NSOnState)
            model_menu.addItem_(item)
            self.modelMenuItems[model_name] = item

        model_menu_item.setSubmenu_(model_menu)
        menu.addItem_(model_menu_item)

        menu.addItem_(NSMenuItem.separatorItem())

        # Quit
        quit_item = NSMenuItem.alloc().initWithTitle_action_keyEquivalent_(
            "Quit", "terminate:", "q"
        )
        quit_item.setKeyEquivalentModifierMask_(NSEventModifierFlagCommand)
        menu.addItem_(quit_item)

        self.statusItem.setMenu_(menu)

    def _setup_push_to_talk(self):
        """Install the global and local modifier-key monitors."""
        # Global monitoring for modifier flags (works even when app doesn't have focus)
        self.flagsChangedMonitor = NSEvent.addGlobalMonitorForEventsMatchingMask_handler_(
            NSEventMaskFlagsChanged, self._handle_flags_changed
        )

        # Local monitoring for modifier flags (when app has focus)
        self.localFlagsChangedMonitor = NSEvent.addLocalMonitorForEventsMatchingMask_handler_(
            NSEventMaskFlagsChanged, self._handle_local_flags_changed
        )

        print("Push-to-talk enabled: Hold Option key to record")

    def _modifier_transition(self, event):
        """Track the trigger key state and classify the change.

        Updates ``self.modifier_pressed`` to the current trigger state and
        returns ``"start"`` when the trigger was just pressed while idle,
        ``"stop"`` when it was just released while recording, and ``None``
        for every other change.
        """
        trigger_pressed = bool(event.modifierFlags() & self.trigger_modifier)
        was_pressed = self.modifier_pressed
        self.modifier_pressed = trigger_pressed

        if trigger_pressed and not was_pressed and not self.recording:
            return "start"
        if not trigger_pressed and was_pressed and self.recording:
            return "stop"
        return None

    def _handle_flags_changed(self, event):
        """Handle global modifier key changes."""
        action = self._modifier_transition(event)

        if action == "start":
            # Option key pressed - start recording (queued on the main queue)
            def start():
                self._start_recording()

            NSOperationQueue.mainQueue().addOperationWithBlock_(start)
        elif action == "stop":
            # Option key released - stop recording (queued on the main queue)
            def stop():
                self._stop_recording()

            NSOperationQueue.mainQueue().addOperationWithBlock_(stop)

    def _handle_local_flags_changed(self, event):
        """Handle local modifier key changes.

        Returns ``None`` to consume the event when it started or stopped a
        recording, otherwise returns the event so it is passed through.
        """
        action = self._modifier_transition(event)

        if action == "start":
            self._start_recording()
            return None  # Consume event
        if action == "stop":
            self._stop_recording()
            return None  # Consume event
        return event  # Pass through

    def _load_model(self):
        """Load the selected Whisper model (runs on a background thread).

        The model name is captured when the load starts. If the user picks a
        different model before this load finishes, the result is discarded so
        a slow, outdated load cannot overwrite the newer selection.
        """
        model_name = self.whisper_model_name
        try:
            self._update_status("Loading model...")
            print(f"Loading faster-whisper model: {model_name}")

            # Load model with faster-whisper (uses CTranslate2 for speed)
            # compute_type="int8" for best performance on CPU
            model = WhisperModel(model_name, device="cpu", compute_type="int8")
        except Exception as e:
            print(f"Error: Model loading failed: {e}")
            # Only report the failure if it concerns the current selection
            if model_name == self.whisper_model_name:
                self._update_status("Model load failed")
                self.model_loaded = False
            return

        if model_name != self.whisper_model_name:
            print(f"Discarding superseded model: {model_name}")
            return

        self.whisper_model = model
        self.model_loaded = True
        self._update_status("Ready")
        print(f"Model loaded: {model_name}")

    def _update_status(self, text):
        """Update the status menu item title via the main operation queue."""

        def update():
            self.statusMenuItem.setTitle_(text)

        NSOperationQueue.mainQueue().addOperationWithBlock_(update)

    def changeModel_(self, sender):
        """Menu action: switch to the model represented by *sender*."""
        new_model = sender.representedObject()

        if new_model == self.whisper_model_name:
            return

        # Update checkmarks
        for name, item in self.modelMenuItems.items():
            item.setState_(NSOnState if name == new_model else NSOffState)

        # Reload model; recording is refused until the new model is loaded
        self.whisper_model_name = new_model
        self.model_loaded = False
        threading.Thread(target=self._load_model, daemon=True).start()

    def _close_stream(self):
        """Stop and close the current audio stream, if there is one."""
        stream = self.audio_stream
        self.audio_stream = None
        if stream is None:
            return
        try:
            stream.stop_stream()
        finally:
            # Always release the stream, even if stopping it failed
            stream.close()

    def _start_recording(self):
        """Start recording audio from the default input device."""
        if not self.pyaudio_instance:
            print("Error: Audio not available")
            return

        if not self.model_loaded:
            print("Error: Model not loaded")
            return

        try:
            self.audio_frames = []
            self.recording = True

            # Audio callback: collect raw input buffers while recording
            def audio_callback(in_data, frame_count, time_info, status):
                if self.recording:
                    self.audio_frames.append(in_data)
                # Input-only stream: there is no output data to hand back
                return (None, pyaudio.paContinue)

            # Open stream
            self.audio_stream = self.pyaudio_instance.open(
                format=self.audio_format,
                channels=self.channels,
                rate=self.rate,
                input=True,
                frames_per_buffer=self.chunk,
                stream_callback=audio_callback,
            )
            self.audio_stream.start_stream()
        except Exception as e:
            print(f"Error: Recording failed: {e}")
            self.recording = False
            # Do not leak a stream that was opened but failed to start
            try:
                self._close_stream()
            except Exception as close_error:
                print(f"Error: Closing audio stream failed: {close_error}")
            return

        # Update UI
        self._update_status("Recording...")
        self.statusItem.setTitle_("🔴")
        print("Recording started")

    def _stop_recording(self):
        """Stop recording and transcribe the captured audio in the background."""
        if not self.recording:
            return

        self.recording = False

        # Stop stream; a failure here must not leave the UI stuck in the
        # recording state or throw away the audio captured so far.
        try:
            self._close_stream()
        except Exception as e:
            print(f"Error: Stop recording failed: {e}")

        # Update UI
        self._update_status("Transcribing...")
        self.statusItem.setTitle_("🎤")
        print("Recording stopped")

        # Transcribe in background on a snapshot of the captured frames
        frames = list(self.audio_frames)
        threading.Thread(
            target=lambda: self._transcribe_audio(frames), daemon=True
        ).start()

    def _transcribe_audio(self, frames):
        """Transcribe recorded audio and copy the text to the clipboard.

        *frames* is the list of raw audio buffers collected by the recording
        callback. They are written to a temporary WAV file, which is removed
        again whether or not transcription succeeds.
        """
        if not frames:
            print("Error: No audio data")
            self._update_status("No audio recorded")
            return

        # Use one model reference throughout, even if the user switches
        # models while this transcription is running.
        model = self.whisper_model
        temp_path = None
        try:
            # Save to temp file
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
                temp_path = temp_file.name

            with wave.open(temp_path, "wb") as wf:
                wf.setnchannels(self.channels)
                wf.setsampwidth(
                    self.pyaudio_instance.get_sample_size(self.audio_format)
                )
                wf.setframerate(self.rate)
                wf.writeframes(b"".join(frames))

            print(f"Transcribing audio file: {temp_path}")

            # Transcribe with faster-whisper
            # Returns segments and info (iterator)
            segments, _info = model.transcribe(temp_path)

            # Strip each segment before joining so the separator does not
            # combine with whitespace already present in the segment text.
            pieces = (segment.text.strip() for segment in segments)
            text = " ".join(piece for piece in pieces if piece)
        except Exception as e:
            print(f"Error: Transcription failed: {e}")
            self._update_status("Transcription failed")
            return
        finally:
            # Cleanup, also when writing or transcribing failed
            if temp_path is not None:
                try:
                    os.unlink(temp_path)
                except OSError as e:
                    print(f"Warning: Could not remove temp file {temp_path}: {e}")

        if text:
            print(f"Transcription: {text}")

            # Copy to clipboard
            self._copy_to_clipboard(text)
            self._update_status("Done - Copied to clipboard")

            # Show result in console
            print(f"Result copied to clipboard: {text}")
        else:
            print("No speech detected")
            self._update_status("No speech detected")

    def _copy_to_clipboard(self, text):
        """Replace the general pasteboard contents with *text*."""
        pasteboard = NSPasteboard.generalPasteboard()
        pasteboard.clearContents()
        pasteboard.setString_forType_(text, NSStringPboardType)

    def applicationWillTerminate_(self, notification):
        """Cleanup on quit: remove key monitors and release audio resources."""
        print("Shutting down...")

        # Remove event monitors
        for attr in ("flagsChangedMonitor", "localFlagsChangedMonitor"):
            monitor = getattr(self, attr, None)
            if monitor:
                NSEvent.removeMonitor_(monitor)

        # The audio attributes only exist once launching has finished
        if getattr(self, "audio_stream", None) is not None:
            try:
                self._close_stream()
            except Exception as e:
                print(f"Error: Closing audio stream failed: {e}")

        pyaudio_instance = getattr(self, "pyaudio_instance", None)
        if pyaudio_instance:
            pyaudio_instance.terminate()


if __name__ == "__main__":
    print("Initializing Whisper Menu Bar App...")

    app = NSApplication.sharedApplication()
    delegate = WhisperMenuBar.alloc().init()
    app.setDelegate_(delegate)

    print("Starting...")
    AppHelper.runEventLoop()