import os
import speech_recognition as sr
from google.cloud import speech
from google.cloud import translate_v2 as translate
import openai
from gtts import gTTS
import html
from colorama import Fore, Style
from termcolor import cprint
from art import *
# Set environment variables
# Placeholder: path to the Google Cloud service-account key file (empty in
# this source; supply a real path before running).
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = r""
# Placeholder: OpenAI API key (empty in this source).
openai.api_key = ''
# Initialize speech recognition engine
r = sr.Recognizer()
def clean_text(text: str) -> str:
    """Return *text* with HTML character references decoded.

    Entities such as ``&amp;`` or ``&#39;`` are converted back to the
    characters they stand for via ``html.unescape``.
    NOTE(review): presumably needed because the translation service returns
    HTML-escaped text -- confirm against the Translation API response format.
    """
    cleaned_text = html.unescape(text)
    return cleaned_text
def transcribe_and_translate_audio(audio_content: bytes) -> None:
    """Run one spoken Hebrew utterance through the full chatbot pipeline.

    For every recognition result returned by Google Cloud Speech-to-Text the
    function prints the transcript, translates it with ``translate_text``
    (default target "en"), sends the translation to ``chat_with_gpt``,
    translates the reply back to Hebrew, decodes HTML entities with
    ``clean_text`` and finally speaks the reply via ``text_to_speech``.

    Args:
        audio_content: Raw audio bytes handed to ``speech.RecognitionAudio``;
            the recognizer is configured for LINEAR16 encoding.
    """
    client = speech.SpeechClient()

    audio = speech.RecognitionAudio(content=audio_content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        language_code="he-IL",  # Language code for Hebrew
    )

    response = client.recognize(config=config, audio=audio)

    for result in response.results:
        # Only the first alternative of each result is used.
        transcript = result.alternatives[0].transcript

        print_color("\nTranscript: ", Fore.GREEN)
        cprint(f"{transcript}", 'white')

        translated_text = translate_text(transcript)
        print_color("\nTranslated text: ", Fore.GREEN)
        cprint(f"{translated_text}", 'white')

        chatbot_response = chat_with_gpt(translated_text)
        print_color("\nChatbot response: ", Fore.GREEN)
        cprint(f"{chatbot_response}", 'white')

        translated_response = translate_text(chatbot_response, target_language="he")
        cleaned_translated_response = clean_text(translated_response)
        print_color("\nTranslated response: ", Fore.GREEN)
        cprint(f"{cleaned_translated_response}", 'white')

        text_to_speech(cleaned_translated_response)
def translate_text(text: str, target_language: str = "en") -> str:
    """Translate *text* with the Google Cloud Translation (v2) client.

    Args:
        text: The text to translate.
        target_language: Language code to translate into; defaults to "en".

    Returns:
        The ``'translatedText'`` entry of the translation result.
    """
    translate_client = translate.Client()

    result = translate_client.translate(
        text, target_language=target_language)

    return result['translatedText']
def chat_with_gpt(message: str) -> str:
    """Send *message* to the OpenAI chat model and return the reply text.

    The request carries a fixed system prompt followed by *message* as the
    single user turn; no conversation history is kept between calls.
    NOTE(review): ``openai.ChatCompletion`` is the pre-1.0 interface of the
    openai package -- confirm the installed SDK version still provides it.

    Args:
        message: The user's message.

    Returns:
        The content of the first choice in the completion response.
    """
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",  # Make sure to use the model you have access to
        messages=[
            {"role": "system",
             "content": "You are a helpful assistant."
             },
            {"role": "user",
             "content": message
             }
        ]
    )
    return response['choices'][0]['message']['content']
from google.cloud import texttospeech
def text_to_speech(text: str) -> None:
    """Synthesize *text* as Hebrew speech, save it to disk and play it.

    The audio is requested from Google Cloud Text-to-Speech as MP3 with a
    "he-IL" voice, written to ``output.mp3`` in the current working
    directory (overwriting any previous file) and played with ``ffplay``.

    Args:
        text: The text to speak.
    """
    client = texttospeech.TextToSpeechClient()

    input_text = texttospeech.SynthesisInput(text=text)

    voice = texttospeech.VoiceSelectionParams(
        language_code="he-IL",
        ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL
    )

    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3
    )

    response = client.synthesize_speech(
        input=input_text,
        voice=voice,
        audio_config=audio_config
    )

    # Save the speech audio into a file
    with open("output.mp3", "wb") as out:
        out.write(response.audio_content)

    # Play the audio file. os.devnull is "nul" on Windows (same command as
    # before) and "/dev/null" elsewhere, so ffplay's output is discarded on
    # every platform instead of being written to a file named "nul".
    os.system(f"ffplay -nodisp -autoexit output.mp3 >{os.devnull} 2>&1")
# Print text in color
def print_color(text: str, color: str) -> None:
    """Print *text* prefixed by *color* and followed by a style reset.

    Args:
        text: The text to print.
        color: A string prepended to the text; callers in this file pass
            colorama ``Fore`` constants.
    """
    print(color + text + Style.RESET_ALL)
# Loop to listen for audio input
while True:
    # Listen for input
    with sr.Microphone() as source:
        cprint("\nSpeak now:", 'yellow')
        audio = r.listen(source)

    # Try to recognize the audio
    try:
        print_color("\nTranscribing and translating audio...", Fore.CYAN)
        transcribe_and_translate_audio(audio.get_wav_data())
        cprint("\n--------------------------------------", 'blue')

    # Catch if recognition fails. Exception (not a bare except) so that
    # Ctrl+C / SystemExit still terminate the loop instead of being reported
    # as a failed recognition.
    except Exception:
        response_text = "Sorry, I didn't get that!"
        cprint(response_text, 'red')
        text_to_speech(response_text)
        cprint("\n--------------------------------------", 'blue')
Introduction
This is a Real-Time Voice Chatbot, an application that enables users, specifically students, to interact with OpenAI’s ChatGPT in real-time and in their preferred language. This project was driven by my vision to provide support for students dealing with learning challenges, reading difficulties, or those who face linguistic barriers, such as newcomers to Israel still learning Hebrew or English.
Using a combination of technologies including Google Cloud’s Speech-to-Text, Translation, and Text-to-Speech services, OpenAI’s GPT-3.5 Turbo model (gpt-3.5-turbo), and the speech_recognition and gtts libraries in Python, I built an application that can transcribe voice inputs, translate the transcriptions into a chosen language, generate responses using ChatGPT, and translate the responses back to the user’s language. The application then converts these responses into voice outputs, allowing users to engage in a seamless and natural conversation with ChatGPT.
The tool operates through a terminal interface, with future plans to develop a more intuitive user interface for a wider audience. Watching the students adopt this tool so effortlessly and seeing its potential in bridging educational disparities has been a truly rewarding experience.