From ea364b5614e7412529dbf72064c318251c683738 Mon Sep 17 00:00:00 2001
From: josemanuel <josetertre@gmail.com>
Date: Fri, 16 May 2025 05:44:52 +0100
Subject: [PATCH] Subir archivos a "/"

---
 01_transcribir_archivo_audio.py | 175 ++++++++++++++++++++++++++++++++
 1 file changed, 175 insertions(+)
 create mode 100644 01_transcribir_archivo_audio.py

diff --git a/01_transcribir_archivo_audio.py b/01_transcribir_archivo_audio.py
new file mode 100644
index 0000000..b3d822c
--- /dev/null
+++ b/01_transcribir_archivo_audio.py
@@ -0,0 +1,175 @@
+"""
+===========================================================
+INSTALACIÓN DE PYTHON Y DEPENDENCIAS (SISTEMA WINDOWS)
+===========================================================
+
+1. INSTALACIÓN DE PYTHON:
+   - Descarga Python para Windows desde: https://www.python.org/downloads/windows/
+   - Se recomienda instalar Python 3.12 o superior.
+   - Durante la instalación, asegúrate de marcar la opción "Add Python to PATH" para que puedas ejecutar Python y pip desde la línea de comandos.
+
+2. CONFIGURACIÓN DEL ENTORNO:
+   - Abre una ventana de comandos (CMD o PowerShell).
+
+3. INSTALACIÓN DE DEPENDENCIAS:
+   Ejecuta los siguientes comandos para instalar las dependencias necesarias:
+
+   pip install sounddevice         -> Para capturar audio en tiempo real (si se usa).
+   pip install numpy               -> Para operaciones numéricas y manejo de arrays.
+   pip install openai-whisper      -> Para transcripción con el modelo Whisper.
+       * Requiere además:
+         - ffmpeg: Descárgalo desde https://ffmpeg.org/download.html#build-windows y agrega la carpeta "bin" al PATH.
+         - PyTorch: Instálalo siguiendo las instrucciones en https://pytorch.org/
+   pip install pydub               -> Para manipulación y segmentación de audio (requiere ffmpeg instalado y en el PATH).
+   pip install SpeechRecognition   -> Para usar Google Speech-to-Text.
+
+4. EJECUCIÓN DEL SCRIPT:
+   - Guarda este código como "01_transcribir_archivo_audio.py" en la carpeta que contiene los archivos a transcribir.
+   - Ejecuta el script desde esa misma carpeta con:
+         python 01_transcribir_archivo_audio.py
+   - Sigue las instrucciones en pantalla: elige el archivo, el nombre de la transcripción y el código de idioma para Google Speech-to-Text.
+
+Nota: En Windows, para capturar el audio del sistema (lo que se reproduce en el sistema), puede ser necesario configurar el dispositivo WASAPI loopback.
+===========================================================
+"""
+
+import numpy as np
+import wave
+import os
+import datetime
+import whisper
+from pydub import AudioSegment
+import speech_recognition as sr
+import sys
+
+# ================== GENERAL SETTINGS ==================
+SAMPLE_RATE = 44100              # Sampling rate (Hz) used for every chunk
+CHUNK_DURATION = 6               # Length in seconds of each chunk to process
+MIN_VOLUME_THRESHOLD = 0.003     # Silence threshold (RMS); quieter chunks are skipped
+TARGET_LANGUAGE = None           # Whisper language (None = automatic detection)
+
+# ================== INTERACTIVE FILE SELECTION ==================
+extensiones_validas = (".mov", ".mp4", ".m4a", ".3gp", ".3g2", ".mj2")
+# Sorted for a stable menu (os.listdir() order is arbitrary); matching directories are skipped.
+archivos_audio = sorted(f for f in os.listdir() if f.lower().endswith(extensiones_validas) and os.path.isfile(f))
+
+if not archivos_audio:
+    print("❌ No se encontraron archivos de audio compatibles en el directorio actual.")
+    sys.exit(1)
+
+print("\nArchivos disponibles:")
+for i, archivo in enumerate(archivos_audio, 1):
+    print(f"{i}. {archivo}")
+
+while True:  # Ask again until a number from the menu is entered
+    try:
+        seleccion = int(input("\nSelecciona el número del archivo a transcribir: "))
+        if 1 <= seleccion <= len(archivos_audio):
+            audio_filename = archivos_audio[seleccion - 1]
+            break
+        print("Número fuera de rango.")
+    except ValueError:
+        print("Entrada no válida. Usa un número.")
+
+# ================== OUTPUT FILE DEFINITION ==================
+output_dir = "transcripciones"
+os.makedirs(output_dir, exist_ok=True)
+
+nombre_sugerido = os.path.splitext(audio_filename)[0] + ".txt"
+transcription_file = input(f"Nombre del archivo de transcripción [Enter = {nombre_sugerido}]: ").strip()
+# Keep only the file name: a typed directory part would point outside output_dir or at a missing folder.
+transcription_file = os.path.basename(transcription_file) or nombre_sugerido
+if not transcription_file.lower().endswith(".txt"):
+    transcription_file += ".txt"
+
+transcription_file = os.path.join(output_dir, transcription_file)
+
+# ================== WHISPER MODEL LOADING ==================
+print("\nCargando modelo Whisper...")
+model = whisper.load_model("base")  # Other model names: 'tiny', 'small', 'medium', 'large'
+print("✅ Modelo cargado.")
+
+# ================== CHUNK PROCESSING ==================
+def process_chunk(chunk):
+    """Transcribe one audio chunk with Whisper and Google and append both results to the transcript.
+
+    `chunk` is a NumPy array of float samples, expected in about [-1, 1] since it is scaled by
+    32767 into 16-bit PCM. Chunks with RMS below MIN_VOLUME_THRESHOLD are skipped. Returns None.
+    """
+    rms = np.sqrt(np.mean(chunk**2))
+    max_volume = np.max(np.abs(chunk))
+    print(f"RMS: {rms:.5f}, Volumen máximo: {max_volume:.5f}")
+
+    if rms < MIN_VOLUME_THRESHOLD:
+        print("Fragmento silencioso. Se omite.")
+        return
+
+    chunk_int16 = np.int16(chunk * 32767)  # 16-bit PCM payload for the temporary WAV
+    temp_filename = "temp_audio.wav"
+    try:
+        try:
+            with wave.open(temp_filename, 'wb') as wf:
+                wf.setnchannels(chunk.shape[1] if chunk.ndim > 1 else 1)
+                wf.setsampwidth(2)
+                wf.setframerate(SAMPLE_RATE)
+                wf.writeframes(chunk_int16.tobytes())
+        except Exception as e:
+            print("❌ Error guardando audio temporal:", e)
+            return
+        chunk_start_time = datetime.datetime.now()  # Wall-clock time, used as the transcript timestamp
+
+        # Whisper transcription; an error is recorded in the transcript instead of aborting.
+        try:
+            if TARGET_LANGUAGE:
+                result_whisper = model.transcribe(temp_filename, language=TARGET_LANGUAGE, fp16=False)
+            else:
+                result_whisper = model.transcribe(temp_filename, fp16=False)
+            text_whisper = result_whisper["text"].strip()
+        except Exception as e:
+            text_whisper = f"[Error Whisper: {e}]"
+
+        # Google Speech-to-Text transcription; google_language is the global set by the main section.
+        r = sr.Recognizer()
+        try:
+            with sr.AudioFile(temp_filename) as source:
+                audio_data = r.record(source)
+            text_google = r.recognize_google(audio_data, language=google_language)
+        except Exception as e:
+            text_google = f"[Error Google: {e}]"
+    finally:  # Remove the temporary WAV on every exit path, including early returns and errors
+        if os.path.exists(temp_filename):
+            os.remove(temp_filename)
+
+    timestamp = chunk_start_time.strftime("%Y-%m-%d %H:%M:%S")
+    transcript = f"[{timestamp}] Whisper: {text_whisper}\n[{timestamp}] Google: {text_google}\n"
+    print(transcript)
+    with open(transcription_file, "a", encoding="utf-8") as f:
+        f.write(transcript)
+
+# ================== MAIN EXECUTION ==================
+if __name__ == "__main__":
+    # Language code for Google Speech-to-Text; an empty answer falls back to es-ES.
+    google_language = input("Código de idioma para Google (ej: es-ES, en-US) [Enter = es-ES]: ").strip() or "es-ES"
+
+    print(f"\n🔊 Cargando archivo: {audio_filename}...")
+    audio_segment = AudioSegment.from_file(audio_filename)
+
+    # Convert to SAMPLE_RATE and mono only when the file is not already in that format.
+    if audio_segment.frame_rate != SAMPLE_RATE:
+        audio_segment = audio_segment.set_frame_rate(SAMPLE_RATE)
+    if audio_segment.channels != 1:
+        audio_segment = audio_segment.set_channels(1)
+
+    # Integer samples become float32 scaled by 1/32767 (presumably 16-bit input; confirm).
+    raw_samples = np.array(audio_segment.get_array_of_samples())
+    samples = raw_samples.astype(np.float32) / 32767.0
+
+    total_samples = len(samples)
+    chunk_size = int(CHUNK_DURATION * SAMPLE_RATE)
+
+    # Feed the signal to process_chunk in CHUNK_DURATION-second column vectors; the last may be shorter.
+    for start in range(0, total_samples, chunk_size):
+        chunk = samples[start:start + chunk_size].reshape(-1, 1)
+        process_chunk(chunk)
+
+    print(f"\n✅ Transcripción final guardada en: {transcription_file}")