In a terminal (cmd):
winget install ffmpeg
Verify that the installation succeeded:
ffmpeg -version
Then install the Python dependencies:
pip install openai-whisper pyaudio numpy
(Note: `wave` is part of the Python standard library and must not be installed with pip.)
If pyaudio fails to install on Windows, use pipwin instead:
pip install pipwin
pipwin install pyaudio
Python:
import whisper
import pyaudio
import numpy as np
import re
from threading import Event
# Audio Settings
CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
MIN_ACTIVE_SECONDS = 0.5 # Minimum speech duration to process
exit_event = Event()
def calibrate_mic(stream, calibrate_seconds=2):
"""Auto-set silence threshold by sampling ambient noise"""
print(f"Calibrating mic (stay silent for {calibrate_seconds}s)...")
samples = []
for _ in range(int(RATE / CHUNK * calibrate_seconds)):
data = stream.read(CHUNK, exception_on_overflow=False)
samples.append(np.abs(np.frombuffer(data, dtype=np.int16)).mean())
return max(np.mean(samples) * 1.5, 100) # Ensure minimum threshold of 100
def clean_text(text):
"""Remove repeated phrases and gibberish"""
text = re.sub(r'(\b\w+\b)(?:\s+\1\b)+', r'\1', text) # Remove repeats
return text.strip() if text.strip() and len(text.split()) >= 1 else ""
def record_chunk(stream, silence_threshold):
"""Record until silence is detected"""
frames = []
silent_frames = 0
max_silent_frames = int(RATE / CHUNK * 1.5) # 1.5s silence = stop
while not exit_event.is_set():
data = stream.read(CHUNK, exception_on_overflow=False)
audio_data = np.frombuffer(data, dtype=np.int16)
volume = np.abs(audio_data).mean()
if volume < silence_threshold:
silent_frames += 1
if silent_frames > max_silent_frames:
break
else:
silent_frames = 0
frames.append(audio_data)
return b''.join(frames) if len(frames) > int(RATE / CHUNK * MIN_ACTIVE_SECONDS) else None
def transcribe_chunk(model, audio_bytes):
"""Convert audio bytes to text"""
audio_np = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0
result = model.transcribe(audio_np, fp16=False, language='en')
return clean_text(result["text"])
def main():
print("Initializing...")
model = whisper.load_model("base") # or "tiny"
p = pyaudio.PyAudio()
stream = p.open(
format=FORMAT,
channels=CHANNELS,
rate=RATE,
input=True,
frames_per_buffer=CHUNK,
input_device_index=None # Auto-select default mic
)
# Calibrate microphone
silence_threshold = calibrate_mic(stream)
print(f"Silence threshold set to: {silence_threshold:.2f}")
print("\nSpeak now (Press Ctrl+C to stop):")
last_text = ""
try:
while not exit_event.is_set():
audio_data = record_chunk(stream, silence_threshold)
if audio_data:
text = transcribe_chunk(model, audio_data)
if text and text != last_text:
print(f"> {text}")
last_text = text
except KeyboardInterrupt:
pass
finally:
exit_event.set()
stream.stop_stream()
stream.close()
p.terminate()
print("\nStopped.")
if __name__ == "__main__":
main()