Chapter 1: Voice-to-Action - Speech Recognition and Command Processing
1.1 Introduction to Voice-Enabled Robotics
Voice interfaces represent a critical frontier in human-robot interaction by enabling intuitive, natural communication. This chapter explores how robots can listen, understand, and act on spoken commands using OpenAI's Whisper model and language processing frameworks.
Why Voice Commands Matter:
- Accessibility: Non-technical users can control robots naturally
- Real-time Interaction: Issuing a spoken command is often faster than teleoperation or navigating a GUI
- Hands-free Operation: Essential for collaborative robotics scenarios
- Cognitive Load Reduction: Users think in natural language, not robot commands
1.2 OpenAI Whisper: Speech-to-Text Foundation
1.2.1 Architecture and Capabilities
OpenAI Whisper is a robust, multilingual automatic speech recognition (ASR) system trained on 680,000 hours of multilingual and multitask supervised data collected from the web.
Model Variants:
| Model | Parameters | Relative Speed (vs. Large) | Typical Use Case |
|---|---|---|---|
| Tiny | 39M | ~32x faster | Edge devices, Jetson |
| Base | 74M | ~16x faster | Real-time, onboard |
| Small | 244M | ~6x faster | High accuracy, still fast |
| Medium | 769M | ~2x faster | Production standard |
| Large | 1.5B | 1x (baseline) | Maximum accuracy |
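The hosted API used later in this chapter exposes a single whisper-1 model; the variants in the table matter when transcription must run on the robot itself. A minimal offline sketch, assuming the open-source openai-whisper package is installed and a recording exists at command.wav:

import whisper

# Load a local model variant from the table above
model = whisper.load_model("base")

# Transcribe a recorded command; fp16=False avoids a warning on CPU-only hosts
result = model.transcribe("command.wav", language="en", fp16=False)
print(result["text"])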
1.2.2 Acoustic Modeling
Audio Waveform
        ↓
┌───────────────────────┐
│  Log-Mel Spectrogram  │  (80 mel-frequency bins)
└───────────────────────┘
        ↓
┌───────────────────────┐
│ Encoder (Transformer) │  → Audio embeddings
└───────────────────────┘
        ↓
┌───────────────────────┐
│ Decoder (Transformer) │  → Text tokens
└───────────────────────┘
        ↓
Transcribed Text
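The encoder's input is easy to inspect directly. A short sketch of the front end using the open-source whisper package's audio utilities (again assuming a local command.wav):

import whisper

# Load audio and pad/trim it to Whisper's fixed 30-second context window
audio = whisper.load_audio("command.wav")
audio = whisper.pad_or_trim(audio)

# Compute the 80-bin log-Mel spectrogram that the encoder consumes
mel = whisper.log_mel_spectrogram(audio)
print(mel.shape)  # torch.Size([80, 3000]): 80 mel bins x 3000 frames over 30 s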
1.2.3 Implementation on Robots
# OpenAI Whisper Integration on Humanoid Robot
# Note: targets the pre-1.0 openai Python SDK (openai.Audio.transcribe).
import io
import wave

import openai
import pyaudio
import rclpy
from rclpy.node import Node
from std_msgs.msg import String


class VoiceCommandNode(Node):
    def __init__(self):
        super().__init__('voice_command_node')

        # The hosted "whisper-1" model runs server-side, so there is no local
        # model to load; the SDK reads OPENAI_API_KEY from the environment.

        # Audio recording parameters
        self.CHUNK = 2048
        self.FORMAT = pyaudio.paInt16  # 16-bit PCM keeps the WAV container valid
        self.CHANNELS = 1
        self.RATE = 16000  # Whisper resamples all input to 16 kHz internally
        self.RECORD_SECONDS = 5

        # Initialize PyAudio
        self.p = pyaudio.PyAudio()

        # Create publisher for recognized commands
        self.command_publisher = self.create_publisher(
            String,
            '/robot/voice_command',
            10
        )

    def record_audio(self):
        """Record a fixed-length clip from the microphone."""
        stream = self.p.open(
            format=self.FORMAT,
            channels=self.CHANNELS,
            rate=self.RATE,
            input=True,
            frames_per_buffer=self.CHUNK
        )
        frames = []
        for _ in range(0, int(self.RATE / self.CHUNK * self.RECORD_SECONDS)):
            data = stream.read(self.CHUNK)
            frames.append(data)
        stream.stop_stream()
        stream.close()
        return b''.join(frames)

    def transcribe_with_whisper(self, audio_data):
        """
        Transcribe audio using the Whisper API.

        Args:
            audio_data: Raw PCM audio bytes

        Returns:
            transcribed_text: String containing recognized speech
        """
        # Wrap the raw PCM bytes in a WAV container
        wav_buffer = io.BytesIO()
        with wave.open(wav_buffer, 'wb') as wav_file:
            wav_file.setnchannels(self.CHANNELS)
            wav_file.setsampwidth(self.p.get_sample_size(self.FORMAT))
            wav_file.setframerate(self.RATE)
            wav_file.writeframes(audio_data)
        wav_buffer.seek(0)
        wav_buffer.name = 'command.wav'  # the API infers the format from the name

        # Call the OpenAI Whisper API
        transcript = openai.Audio.transcribe(
            model="whisper-1",
            file=wav_buffer,
            language="en"  # Specifying the language improves accuracy
        )
        return transcript['text']

    def listen_and_transcribe(self):
        """Main loop for voice command listening."""
        self.get_logger().info("🎤 Listening for voice commands...")
        while rclpy.ok():
            try:
                # Record audio
                audio_data = self.record_audio()

                # Transcribe using Whisper
                command_text = self.transcribe_with_whisper(audio_data)
                self.get_logger().info(f"📝 Recognized: {command_text}")

                # Publish recognized command
                msg = String()
                msg.data = command_text
                self.command_publisher.publish(msg)
            except Exception as e:
                self.get_logger().error(f"Error in voice recognition: {e}")


# Entry point
if __name__ == '__main__':
    rclpy.init()
    node = VoiceCommandNode()
    try:
        node.listen_and_transcribe()
    finally:
        node.destroy_node()
        rclpy.shutdown()
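Two design choices here are worth noting. The node records fixed five-second windows, which keeps the example simple but can clip commands mid-word; production systems usually gate recording with voice-activity detection instead (Section 1.2.4 adds basic screening). Recognized text is published on /robot/voice_command, so it can be observed during testing with ros2 topic echo /robot/voice_command.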
1.2.4 Handling Edge Cases and Robustness
import numpy as np


class RobustVoiceCommandProcessor:
    """Handle edge cases in voice recognition."""

    def __init__(self):
        self.confidence_threshold = 0.7
        self.max_retries = 3
        self.background_noise_threshold = -40  # dBFS noise floor (reserved)
        self.silence_amplitude = 0.01  # per-sample cutoff for floats in [-1, 1]

    def validate_audio_quality(self, audio_data):
        """Check if audio quality is sufficient.

        Expects a float array normalized to [-1.0, 1.0].
        """
        # Compute RMS (Root Mean Square) as a loudness estimate
        rms = np.sqrt(np.mean(audio_data ** 2))
        if rms < 0.01:
            return False, "Audio too quiet"
        return True, "Audio quality OK"

    def handle_silence(self, audio_data):
        """Detect and flag clips that are mostly silence."""
        # Count samples below an absolute amplitude cutoff; a clip that is
        # more than 80% near-silent is not worth transcribing
        silent_samples = np.sum(np.abs(audio_data) < self.silence_amplitude)
        silence_ratio = silent_samples / len(audio_data)
        if silence_ratio > 0.8:
            return "SILENCE_DETECTED"
        return None

    def retry_with_feedback(self, voice_processor):
        """Retry recognition with user feedback."""
        for attempt in range(self.max_retries):
            try:
                # Re-record and re-transcribe on each attempt
                audio_data = voice_processor.record_audio()
                transcript = voice_processor.transcribe_with_whisper(audio_data)
                if len(transcript) > 0:
                    return transcript
            except Exception:
                if attempt < self.max_retries - 1:
                    print(f"⚠️ Recognition failed. Retrying... ({attempt + 1}/{self.max_retries})")
                else:
                    print(f"❌ Failed after {self.max_retries} attempts")
        return None
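These checks slot in ahead of the API call. A minimal sketch of the combined flow, assuming the VoiceCommandNode defined above (the helper name process_one_utterance is illustrative):

import numpy as np

def process_one_utterance(node, robust):
    """Record, screen, and transcribe one clip; returns text or None."""
    raw = node.record_audio()
    # Convert int16 PCM bytes to float samples in [-1.0, 1.0]
    samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32) / 32768.0

    ok, reason = robust.validate_audio_quality(samples)
    if not ok or robust.handle_silence(samples) == "SILENCE_DETECTED":
        return None  # skip quiet or silent clips instead of paying for an API call
    return node.transcribe_with_whisper(raw)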
1.3 Command Parsing and Validation
Once speech is transcribed, the text must be parsed to extract intent and parameters.
1.3.1 Intent Recognition
import re
from typing import Tuple, Dict


class CommandParser:
    """Parse natural language commands into robot actions."""

    def __init__(self):
        # Define command patterns
        self.patterns = {
            'move': r'\b(move|go|walk|navigate)(?:\s+(?:to|towards))?\s+(?:the\s+)?(\w+)',
            'pick': r'\b(pick up|grab|grasp)\s+(?:the\s+)?(\w+)',
            'place': r'\b(put|place|drop)\s+(?:the\s+)?(\w+)\s+(?:at|on|in)\s+(?:the\s+)?(\w+)',
            'look': r'\b(look|see|find|search)\s+(?:for\s+)?(?:the\s+)?(\w+)',
            'stop': r'\b(stop|halt|pause)',
            'reset': r'\b(reset|restart)'
        }

    def parse_command(self, text: str) -> Tuple[str, Dict]:
        """
        Parse command text into an action and parameters.

        Args:
            text: Transcribed voice command

        Returns:
            action: Command type (e.g., 'move', 'pick')
            params: Dictionary of command parameters
        """
        text = text.lower().strip()

        for action, pattern in self.patterns.items():
            # search (not match) tolerates leading filler such as
            # "please" or the robot's name
            match = re.search(pattern, text)
            if match:
                groups = match.groups()
                # Extract parameters based on action
                if action == 'move':
                    return action, {'destination': groups[1]}
                elif action == 'pick':
                    return action, {'object': groups[1]}
                elif action == 'place':
                    return action, {
                        'object': groups[1],
                        'location': groups[2]
                    }
                elif action == 'look':
                    return action, {'target': groups[1]}
                elif action in ['stop', 'reset']:
                    return action, {}

        return 'unknown', {}

    def validate_command(self, action: str, params: Dict) -> bool:
        """
        Validate that a command has all required parameters.

        Args:
            action: Command type
            params: Command parameters

        Returns:
            True if valid, False otherwise
        """
        required_params = {
            'move': ['destination'],
            'pick': ['object'],
            'place': ['object', 'location'],
            'look': ['target'],
            'stop': [],
            'reset': []
        }

        if action not in required_params:
            return False

        for param in required_params[action]:
            if param not in params or params[param] is None:
                return False

        return True
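A quick check of the parser on a few representative strings (the sample commands are illustrative):

parser = CommandParser()

for text in ["Move to the kitchen", "pick up the cup",
             "place the cup on the table", "stop", "do a backflip"]:
    action, params = parser.parse_command(text)
    valid = parser.validate_command(action, params)
    print(f"{text!r:30} -> {action:8} {params} valid={valid}")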
1.3.2 Confidence Scoring
class ConfidenceEstimator:
    """Estimate confidence of parsed commands."""

    def estimate_confidence(self, command_text: str,
                            parsed_action: str,
                            parse_confidence: float) -> float:
        """
        Multi-factor confidence estimation.

        Args:
            command_text: Original transcribed text
            parsed_action: Extracted action
            parse_confidence: Confidence from parser (0-1)

        Returns:
            Overall confidence score (0-1)
        """
        # Length-based confidence (very short = suspicious)
        length_score = min(len(command_text) / 10, 1.0)

        # Known action confidence
        known_actions = ['move', 'pick', 'place', 'look', 'stop', 'reset']
        action_score = 1.0 if parsed_action in known_actions else 0.3

        # Combine scores with fixed weights
        overall_confidence = (parse_confidence * 0.5 +
                              length_score * 0.25 +
                              action_score * 0.25)
        return overall_confidence
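Putting the pieces together: a hedged sketch of the accept-or-reprompt decision, with an illustrative parse confidence (the regex parser is binary, so a fixed score stands in for a real parser's probability):

parser = CommandParser()
estimator = ConfidenceEstimator()

text = "move to the kitchen"
action, params = parser.parse_command(text)

# A successful regex parse is treated as high confidence; an ASR
# word-level score could be substituted here if one is available
parse_conf = 0.9 if action != 'unknown' else 0.1

confidence = estimator.estimate_confidence(text, action, parse_conf)
if confidence >= 0.7 and parser.validate_command(action, params):  # threshold from 1.2.4
    print(f"Executing {action} with {params} (confidence {confidence:.2f})")
else:
    print("Low confidence; ask the user to repeat the command")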