# mp4-to-text/01_transcribir_archivo_audio.py

"""
===========================================================
INSTALACIÓN DE PYTHON Y DEPENDENCIAS (SISTEMA WINDOWS)
===========================================================

1. INSTALACIÓN DE PYTHON:
   - Descarga Python para Windows desde: https://www.python.org/downloads/windows/
   - Se recomienda instalar Python 3.12 o superior.
   - Durante la instalación, asegúrate de marcar la opción "Add Python to PATH" para que puedas ejecutar Python y pip desde la línea de comandos.

2. CONFIGURACIÓN DEL ENTORNO:
   - Abre una ventana de comandos (CMD o PowerShell).

3. INSTALACIÓN DE DEPENDENCIAS:
   Ejecuta los siguientes comandos para instalar las dependencias necesarias:

   pip install sounddevice         -> Para capturar audio en tiempo real (si se usa).
   pip install numpy               -> Para operaciones numéricas y manejo de arrays.
   pip install openai-whisper      -> Para transcripción con el modelo Whisper.
       * Requiere además:
         - ffmpeg: Descárgalo desde https://ffmpeg.org/download.html#build-windows y agrega la carpeta "bin" al PATH.
         - PyTorch: Instálalo siguiendo las instrucciones en https://pytorch.org/
   pip install pydub               -> Para manipulación y segmentación de audio (requiere ffmpeg instalado y en el PATH).
   pip install SpeechRecognition   -> Para usar Google Speech-to-Text.

4. EJECUCIÓN DEL SCRIPT:
   - Guarda este código en un archivo (por ejemplo, "transcriptor.py").
   - Ejecuta el script desde la línea de comandos con:
         python transcriptor.py
   - Sigue las instrucciones en pantalla para ingresar el código de idioma para Google Speech-to-Text.

Nota: En Windows, para capturar el audio del sistema (lo que se reproduce en el sistema), puede ser necesario configurar el dispositivo WASAPI loopback.
===========================================================
"""

import numpy as np
import wave
import os
import datetime
import whisper
from pydub import AudioSegment
import speech_recognition as sr
import sys

# ================== GENERAL CONFIGURATION ==================
# Sampling frequency, in Hz, used for the audio handed to the transcribers.
SAMPLE_RATE = 44_100
# Length, in seconds, of each fragment that is processed.
CHUNK_DURATION = 6
# RMS silence threshold: fragments quieter than this are discarded.
MIN_VOLUME_THRESHOLD = 3e-3
# Language passed to Whisper (None = automatic detection).
TARGET_LANGUAGE = None

# ================== INTERACTIVE FILE SELECTION ==================
# Container extensions accepted as input (matched case-insensitively).
extensiones_validas = (".mov", ".mp4", ".m4a", ".3gp", ".3g2", ".mj2")
# Sort the listing so the menu numbering is stable between runs
# (os.listdir() returns entries in arbitrary order) and skip directories
# whose name happens to end in one of the accepted extensions.
archivos_audio = sorted(
    f for f in os.listdir()
    if f.lower().endswith(extensiones_validas) and os.path.isfile(f)
)

if not archivos_audio:
    print("❌ No se encontraron archivos de audio compatibles en el directorio actual.")
    sys.exit(1)

print("\nArchivos disponibles:")
for i, archivo in enumerate(archivos_audio, 1):
    print(f"{i}. {archivo}")

# Keep prompting until the user enters a valid 1-based menu number.
while True:
    try:
        seleccion = int(input("\nSelecciona el número del archivo a transcribir: "))
    except ValueError:
        print("Entrada no válida. Usa un número.")
        continue
    except EOFError:
        # stdin was closed (e.g. piped input ran out): exit cleanly instead
        # of crashing with a traceback.
        print("\n❌ No se seleccionó ningún archivo.")
        sys.exit(1)

    if 1 <= seleccion <= len(archivos_audio):
        audio_filename = archivos_audio[seleccion - 1]
        break
    print("Número fuera de rango.")

# ================== OUTPUT FILE DEFINITION ==================
# All transcripts are stored inside this folder (created on demand).
output_dir = "transcripciones"
os.makedirs(output_dir, exist_ok=True)

# Default name: the selected media file with its extension swapped for .txt.
nombre_sugerido = os.path.splitext(audio_filename)[0] + ".txt"
transcription_file = input(
    f"Nombre del archivo de transcripción [Enter = {nombre_sugerido}]: "
).strip()
# A custom name gets the .txt suffix appended when the user left it out.
if transcription_file and not transcription_file.lower().endswith(".txt"):
    transcription_file += ".txt"

# An empty answer falls back to the suggested name.
transcription_file = os.path.join(output_dir, transcription_file or nombre_sugerido)

# ================== WHISPER MODEL LOADING ==================
# Model size to load; other options are 'tiny', 'small', 'medium', 'large'.
WHISPER_MODEL_NAME = "base"

print("\nCargando modelo Whisper...")
model = whisper.load_model(WHISPER_MODEL_NAME)
print("✅ Modelo cargado.")

# ================== CHUNK PROCESSING ==================
def process_chunk(chunk):
    """Transcribe one audio fragment with Whisper and Google and log both.

    The fragment is skipped when it is empty or when its RMS level is below
    ``MIN_VOLUME_THRESHOLD``. Otherwise it is written to a temporary 16-bit
    WAV file at ``SAMPLE_RATE``, transcribed by the module-level Whisper
    ``model`` and by Google Speech-to-Text (using the module-level
    ``google_language``), and both results are printed and appended to
    ``transcription_file``.

    Args:
        chunk: NumPy array of float samples, expected in the range
            [-1.0, 1.0]; either 1-D or shaped ``(frames, channels)``.

    Returns:
        None. Failures of either engine are reported inline in the
        transcript as ``[Error Whisper: ...]`` / ``[Error Google: ...]``.
    """
    # Local import keeps this function self-contained; the file's import
    # block does not bring in tempfile.
    import tempfile

    # An empty fragment would yield a NaN RMS (mean of an empty array), which
    # slips past the silence check below, so reject it explicitly.
    if chunk.size == 0:
        print("Fragmento vacío. Se omite.")
        return

    # RMS (energy level of the fragment) and peak volume.
    rms = np.sqrt(np.mean(chunk**2))
    max_volume = np.max(np.abs(chunk))
    print(f"RMS: {rms:.5f}, Volumen máximo: {max_volume:.5f}")

    if rms < MIN_VOLUME_THRESHOLD:
        print("Fragmento silencioso. Se omite.")
        return

    # Convert to int16 for the WAV file. Clip first so that samples slightly
    # outside [-1, 1] saturate instead of overflowing the int16 cast.
    chunk_int16 = np.int16(np.clip(chunk, -1.0, 1.0) * 32767)

    # Use a unique temporary file rather than a fixed name in the working
    # directory, which could overwrite (and later delete) a user's file.
    fd, temp_filename = tempfile.mkstemp(prefix="temp_audio_", suffix=".wav")
    os.close(fd)  # wave.open() reopens the path itself

    try:
        try:
            with wave.open(temp_filename, 'wb') as wf:
                channels = chunk.shape[1] if chunk.ndim > 1 else 1
                wf.setnchannels(channels)
                wf.setsampwidth(2)
                wf.setframerate(SAMPLE_RATE)
                wf.writeframes(chunk_int16.tobytes())
        except Exception as e:
            print("❌ Error guardando audio temporal:", e)
            return

        # Wall-clock time at which processing of this fragment started
        # (not the fragment's position inside the audio).
        chunk_start_time = datetime.datetime.now()

        # Whisper transcription; the language is only passed when configured,
        # otherwise Whisper is left to pick it on its own.
        whisper_options = {"fp16": False}
        if TARGET_LANGUAGE:
            whisper_options["language"] = TARGET_LANGUAGE
        try:
            result_whisper = model.transcribe(temp_filename, **whisper_options)
            text_whisper = result_whisper["text"].strip()
        except Exception as e:
            text_whisper = f"[Error Whisper: {e}]"

        # Google Speech-to-Text transcription.
        r = sr.Recognizer()
        try:
            with sr.AudioFile(temp_filename) as source:
                audio_data = r.record(source)
            text_google = r.recognize_google(audio_data, language=google_language)
        except Exception as e:
            text_google = f"[Error Google: {e}]"

        # Store both results under the same timestamp.
        timestamp = chunk_start_time.strftime("%Y-%m-%d %H:%M:%S")
        transcript = f"[{timestamp}] Whisper: {text_whisper}\n[{timestamp}] Google: {text_google}\n"

        print(transcript)
        with open(transcription_file, "a", encoding="utf-8") as f:
            f.write(transcript)
    finally:
        # Always remove the temporary WAV, even if a step above raised.
        try:
            os.remove(temp_filename)
        except OSError as e:
            print("⚠️ No se pudo eliminar el archivo temporal:", e)

# ================== MAIN EXECUTION ==================
if __name__ == "__main__":
    # Ask for the Google Speech-to-Text language. process_chunk() reads this
    # value as a module-level global.
    google_language = input("Código de idioma para Google (ej: es-ES, en-US) [Enter = es-ES]: ").strip()
    if not google_language:
        google_language = "es-ES"

    print(f"\n🔊 Cargando archivo: {audio_filename}...")
    try:
        audio_segment = AudioSegment.from_file(audio_filename)
    except Exception as e:
        # Top-level boundary: report a decode failure with a readable
        # message instead of a raw traceback.
        print(f"❌ No se pudo cargar el archivo de audio: {e}")
        sys.exit(1)

    # Normalise the audio to mono at SAMPLE_RATE.
    if audio_segment.frame_rate != SAMPLE_RATE:
        audio_segment = audio_segment.set_frame_rate(SAMPLE_RATE)
    if audio_segment.channels != 1:
        audio_segment = audio_segment.set_channels(1)
    # The scaling below assumes 16-bit samples; force that width so sources
    # with a different sample width are not mis-scaled.
    if audio_segment.sample_width != 2:
        audio_segment = audio_segment.set_sample_width(2)

    # 16-bit integers -> float32 in roughly [-1.0, 1.0].
    samples = np.array(audio_segment.get_array_of_samples()).astype(np.float32) / 32767.0

    chunk_size = int(CHUNK_DURATION * SAMPLE_RATE)
    total_samples = len(samples)

    # Process consecutive fragments; the last one may be shorter.
    for start in range(0, total_samples, chunk_size):
        end = start + chunk_size
        # Column vector (frames, 1): one mono channel.
        chunk = samples[start:end].reshape(-1, 1)
        process_chunk(chunk)

    print(f"\n✅ Transcripción final guardada en: {transcription_file}")