Functions/GenXdev.AI.Queries/Start-AudioTranscription.ps1

###############################################################################
<#
.SYNOPSIS
Transcribes audio to text using various input methods and advanced configuration
options.
 
.DESCRIPTION
This function provides comprehensive audio transcription capabilities, supporting
both real-time recording and file-based transcription. It offers extensive
configuration options for language detection, audio processing, and output
formatting.
 
Key features:
- Multiple audio input sources (microphone, desktop audio, wav files)
- Automatic silence detection (VOX)
- Multi-language support
- Token timestamp generation
- CPU/GPU processing optimization
- Advanced audio processing parameters
 
.PARAMETER ModelFilePath
Path to store model files. Defaults to local GenXdev folder.
 
.PARAMETER WaveFile
Path to the 16 kHz mono .WAV file to process.
 
.PARAMETER VOX
Use silence detection to automatically stop recording.
 
.PARAMETER PassThru
Returns objects instead of strings.
 
.PARAMETER UseDesktopAudioCapture
Whether to use desktop audio capture instead of microphone input.
 
.PARAMETER WithTokenTimestamps
Whether to include token timestamps in the output.
 
.PARAMETER TokenTimestampsSumThreshold
Sum threshold for token timestamps, defaults to 0.5.
 
.PARAMETER SplitOnWord
Whether to split on word boundaries.
 
.PARAMETER MaxTokensPerSegment
Maximum number of tokens per segment.
 
.PARAMETER IgnoreSilence
Whether to ignore silence (will mess up timestamps).
 
.PARAMETER MaxDurationOfSilence
Maximum duration of silence before automatically stopping recording.
Accepts a TimeSpan or a number of seconds.
 
.PARAMETER SilenceThreshold
Silence detect threshold (0..32767 defaults to 30).
 
.PARAMETER Language
Sets the language to detect.
 
.PARAMETER CpuThreads
Number of CPU threads to use, defaults to 0 (auto).
 
.PARAMETER Temperature
Temperature for speech generation.
 
.PARAMETER TemperatureInc
Temperature increment.
 
.PARAMETER WithTranslate
Whether to translate the output.
 
.PARAMETER Prompt
Prompt to use for the model.
 
.PARAMETER SuppressRegex
Regex to suppress tokens from the output.
 
.PARAMETER WithProgress
Whether to show progress.
 
.PARAMETER AudioContextSize
Size of the audio context.
 
.PARAMETER DontSuppressBlank
Whether to NOT suppress blank lines.
 
.PARAMETER MaxDuration
Maximum duration of the audio. Accepts a TimeSpan or a number of seconds.
 
.PARAMETER Offset
Offset for the audio. Accepts a TimeSpan or a number of seconds.
 
.PARAMETER MaxLastTextTokens
Maximum number of last text tokens.
 
.PARAMETER SingleSegmentOnly
Whether to use single segment only.
 
.PARAMETER PrintSpecialTokens
Whether to print special tokens.
 
.PARAMETER MaxSegmentLength
Maximum segment length.
 
.PARAMETER MaxInitialTimestamp
Start timestamps at this moment. Accepts a TimeSpan or a number of seconds.
 
.PARAMETER LengthPenalty
Length penalty.
 
.PARAMETER EntropyThreshold
Entropy threshold.
 
.PARAMETER LogProbThreshold
Log probability threshold.
 
.PARAMETER NoSpeechThreshold
No speech threshold.
 
.PARAMETER NoContext
Don't use context.
 
.PARAMETER WithBeamSearchSamplingStrategy
Use beam search sampling strategy.
 
.PARAMETER Realtime
Enable real-time transcription mode.
 
.PARAMETER SessionOnly
Use alternative settings stored in session for AI preferences like Language,
Image collections, etc.
 
.PARAMETER ClearSession
Clear alternative settings stored in session for AI preferences like Language,
Image collections, etc.
 
.PARAMETER PreferencesDatabasePath
Database path for preference data files.
 
.PARAMETER SkipSession
Do not use alternative settings stored in session for AI preferences like
Language, Image collections, etc.
 
.EXAMPLE
Start-AudioTranscription -ModelFilePath "C:\Models" -Language "English" `
    -WithTokenTimestamps -PassThru:$false
 
.EXAMPLE
transcribe -VOX -UseDesktopAudioCapture -Language "English"
#>

function Start-AudioTranscription {

    # Entry point for audio transcription. The function normalises the caller's
    # parameters (time values, model path, VOX defaults, CPU thread count),
    # strips unset values, and then forwards the remaining arguments to either
    # GenXdev.Helpers\Receive-RealTimeSpeechToText (live capture) or
    # GenXdev.Helpers\Get-SpeechToText (when a wave file is supplied).
    [Alias("transcribe", "recordandtranscribe")]
    [CmdletBinding(SupportsShouldProcess = $true)]
    param (
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            Position = 0,
            HelpMessage = "Path where model files are stored"
        )]
        [string] $ModelFilePath,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            Position = 1,
            HelpMessage = "Path to the 16Khz mono, .WAV file to process"
        )]
        [string] $WaveFile = $null,
        ###########################################################################
        # accepts a TimeSpan or a number of seconds (converted in the begin block)
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Maximum duration of silence before automatically " +
                         "stopping recording"
        )]
        [object] $MaxDurationOfSilence,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Silence detect threshold (0..32767 defaults to 30)"
        )]
        [ValidateRange(0, 32767)]
        [int] $SilenceThreshold = 30,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Sets the language to detect"
        )]
        [ValidateSet(
            "Afrikaans",
            "Akan",
            "Albanian",
            "Amharic",
            "Arabic",
            "Armenian",
            "Azerbaijani",
            "Basque",
            "Belarusian",
            "Bemba",
            "Bengali",
            "Bihari",
            "Bork, bork, bork!",
            "Bosnian",
            "Breton",
            "Bulgarian",
            "Cambodian",
            "Catalan",
            "Cherokee",
            "Chichewa",
            "Chinese (Simplified)",
            "Chinese (Traditional)",
            "Corsican",
            "Croatian",
            "Czech",
            "Danish",
            "Dutch",
            "Elmer Fudd",
            "English",
            "Esperanto",
            "Estonian",
            "Ewe",
            "Faroese",
            "Filipino",
            "Finnish",
            "French",
            "Frisian",
            "Ga",
            "Galician",
            "Georgian",
            "German",
            "Greek",
            "Guarani",
            "Gujarati",
            "Hacker",
            "Haitian Creole",
            "Hausa",
            "Hawaiian",
            "Hebrew",
            "Hindi",
            "Hungarian",
            "Icelandic",
            "Igbo",
            "Indonesian",
            "Interlingua",
            "Irish",
            "Italian",
            "Japanese",
            "Javanese",
            "Kannada",
            "Kazakh",
            "Kinyarwanda",
            "Kirundi",
            "Klingon",
            "Kongo",
            "Korean",
            "Krio (Sierra Leone)",
            "Kurdish",
            "Kurdish (Soranî)",
            "Kyrgyz",
            "Laothian",
            "Latin",
            "Latvian",
            "Lingala",
            "Lithuanian",
            "Lozi",
            "Luganda",
            "Luo",
            "Macedonian",
            "Malagasy",
            "Malay",
            "Malayalam",
            "Maltese",
            "Maori",
            "Marathi",
            "Mauritian Creole",
            "Moldavian",
            "Mongolian",
            "Montenegrin",
            "Nepali",
            "Nigerian Pidgin",
            "Northern Sotho",
            "Norwegian",
            "Norwegian (Nynorsk)",
            "Occitan",
            "Oriya",
            "Oromo",
            "Pashto",
            "Persian",
            "Pirate",
            "Polish",
            "Portuguese (Brazil)",
            "Portuguese (Portugal)",
            "Punjabi",
            "Quechua",
            "Romanian",
            "Romansh",
            "Runyakitara",
            "Russian",
            "Scots Gaelic",
            "Serbian",
            "Serbo-Croatian",
            "Sesotho",
            "Setswana",
            "Seychellois Creole",
            "Shona",
            "Sindhi",
            "Sinhalese",
            "Slovak",
            "Slovenian",
            "Somali",
            "Spanish",
            "Spanish (Latin American)",
            "Sundanese",
            "Swahili",
            "Swedish",
            "Tajik",
            "Tamil",
            "Tatar",
            "Telugu",
            "Thai",
            "Tigrinya",
            "Tonga",
            "Tshiluba",
            "Tumbuka",
            "Turkish",
            "Turkmen",
            "Twi",
            "Uighur",
            "Ukrainian",
            "Urdu",
            "Uzbek",
            "Vietnamese",
            "Welsh",
            "Wolof",
            "Xhosa",
            "Yiddish",
            "Yoruba",
            "Zulu"
        )]
        [string] $Language,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Number of CPU threads to use, defaults to 0 (auto)"
        )]
        [int] $CpuThreads = 0,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Temperature for speech generation"
        )]
        [ValidateRange(0, 100)]
        [float] $Temperature = 0.01,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Temperature increment"
        )]
        [ValidateRange(0, 1)]
        [float] $TemperatureInc,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Prompt to use for the model"
        )]
        [string] $Prompt,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Regex to suppress tokens from the output"
        )]
        [string] $SuppressRegex = $null,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Size of the audio context"
        )]
        [int] $AudioContextSize,
        ###########################################################################
        # accepts a TimeSpan or a number of seconds (converted in the begin block)
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Maximum duration of the audio"
        )]
        [object] $MaxDuration,
        ###########################################################################
        # accepts a TimeSpan or a number of seconds (converted in the begin block)
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Offset for the audio"
        )]
        [object] $Offset,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Maximum number of last text tokens"
        )]
        [int] $MaxLastTextTokens,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Maximum segment length"
        )]
        [int] $MaxSegmentLength,
        ###########################################################################
        # accepts a TimeSpan or a number of seconds (converted in the begin block)
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Start timestamps at this moment"
        )]
        [object] $MaxInitialTimestamp,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Length penalty"
        )]
        [ValidateRange(0, 1)]
        [float] $LengthPenalty,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Entropy threshold"
        )]
        [ValidateRange(0, 1)]
        [float] $EntropyThreshold,
        ###########################################################################
        # NOTE(review): log probabilities are normally <= 0, so a 0..1 range looks
        # suspicious here - confirm against the underlying speech-to-text engine
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Log probability threshold"
        )]
        [ValidateRange(0, 1)]
        [float] $LogProbThreshold,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "No speech threshold"
        )]
        [ValidateRange(0, 1)]
        [float] $NoSpeechThreshold,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Sum threshold for token timestamps, defaults to 0.5"
        )]
        [float] $TokenTimestampsSumThreshold = 0.5,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Maximum number of tokens per segment"
        )]
        [int] $MaxTokensPerSegment,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Database path for preference data files"
        )]
        [string] $PreferencesDatabasePath,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Use silence detection to automatically stop recording."
        )]
        [switch] $VOX,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Returns objects instead of strings"
        )]
        [switch] $PassThru,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Whether to use desktop audio capture instead of " +
                         "microphone input"
        )]
        [switch] $UseDesktopAudioCapture,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Whether to include token timestamps in the output"
        )]
        [switch] $WithTokenTimestamps,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Whether to split on word boundaries"
        )]
        [switch] $SplitOnWord,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Whether to ignore silence (will mess up timestamps)"
        )]
        [switch] $IgnoreSilence,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Whether to translate the output"
        )]
        [switch] $WithTranslate,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Whether to show progress"
        )]
        [switch] $WithProgress,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Whether to NOT suppress blank lines"
        )]
        [switch] $DontSuppressBlank,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Whether to use single segment only"
        )]
        [switch] $SingleSegmentOnly,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Whether to print special tokens"
        )]
        [switch] $PrintSpecialTokens,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Don't use context"
        )]
        [switch] $NoContext,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Use beam search sampling strategy"
        )]
        [switch] $WithBeamSearchSamplingStrategy,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Enable real-time transcription mode"
        )]
        [switch] $Realtime,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Use alternative settings stored in session for AI " +
                         "preferences like Language, Image collections, etc"
        )]
        [switch] $SessionOnly,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Clear alternative settings stored in session for AI " +
                         "preferences like Language, Image collections, etc"
        )]
        [switch] $ClearSession,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Dont use alternative settings stored in session for " +
                         "AI preferences like Language, Image collections, etc"
        )]
        [Alias("FromPreferences")]
        [switch] $SkipSession
        ###########################################################################
    )

    begin {

        # copy identical parameter values for meta language function
        $params = GenXdev.Helpers\Copy-IdenticalParamValues `
            -BoundParameters $PSBoundParameters `
            -FunctionName "GenXdev.AI\Get-AIMetaLanguage" `
            -DefaultValues (Microsoft.PowerShell.Utility\Get-Variable `
                -Scope Local `
                -ErrorAction SilentlyContinue)

        # get ai meta language setting or use default web language
        # NOTE(review): the resolved value is only forwarded further down when
        # -Language was explicitly bound - confirm that is the intent
        $Language = GenXdev.AI\Get-AIMetaLanguage @params

        # output initialization message for verbose logging
        Microsoft.PowerShell.Utility\Write-Verbose (
            "Initializing audio transcription with selected options"
        )

        # time-valued parameters are typed [object] so callers may pass either
        # a TimeSpan or a plain number of seconds; normalise every supplied one
        # to a TimeSpan, both in the local variable and in the bound parameters
        $timeParameterNames = @(
            "MaxDurationOfSilence",
            "MaxDuration",
            "Offset",
            "MaxInitialTimestamp"
        )

        foreach ($timeParameterName in $timeParameterNames) {

            # parameters the caller did not supply are left untouched
            if (-not $PSBoundParameters.ContainsKey($timeParameterName)) {

                continue
            }

            $suppliedValue = $PSBoundParameters[$timeParameterName]

            # values that already are timespans need no conversion
            if ($suppliedValue -is [System.TimeSpan]) {

                continue
            }

            $convertedValue = [System.TimeSpan]::FromSeconds($suppliedValue)

            $PSBoundParameters[$timeParameterName] = $convertedValue

            # keep the parameter variable in sync with the bound value
            Microsoft.PowerShell.Utility\Set-Variable `
                -Name $timeParameterName `
                -Value $convertedValue
        }
    }

    process {

        # create default model file path if not provided or invalid
        if ([string]::IsNullOrWhiteSpace($ModelFilePath) -or
            (-not ([IO.Directory]::Exists($ModelFilePath)))) {

            $ModelFilePath = GenXdev.FileSystem\Expand-Path (
                "$($Env:LOCALAPPDATA)\GenXdev.PowerShell\"
            ) -CreateDirectory
        }

        # output model path information for verbose logging
        Microsoft.PowerShell.Utility\Write-Verbose "Using model path: $ModelFilePath"

        # add or update model path parameter in bound parameters
        # (the dictionary indexer handles both the add and the update case)
        $PSBoundParameters["ModelFilePath"] = $ModelFilePath

        # configure voice activation detection (VOX) settings
        if ($VOX -eq $true) {

            # output vox configuration message for verbose logging
            Microsoft.PowerShell.Utility\Write-Verbose "Configuring VOX settings"

            # default to four seconds of silence, but respect an explicit
            # -MaxDurationOfSilence: the begin block already normalised it
            if (-not $PSBoundParameters.ContainsKey("MaxDurationOfSilence")) {

                $PSBoundParameters["MaxDurationOfSilence"] = `
                    [System.TimeSpan]::FromSeconds(4)
            }

            # vox mode always enables ignore silence
            $PSBoundParameters["IgnoreSilence"] = $true

            # remove vox parameter as it's processed
            # (Remove is a no-op when the key is absent)
            $null = $PSBoundParameters.Remove("VOX")
        }

        # ensure error action is set to stop for proper error handling
        if (-not $PSBoundParameters.ContainsKey("ErrorAction")) {

            $null = $PSBoundParameters.Add("ErrorAction", "Stop")
        }

        # optimize for cpu when no capable gpu is present
        if (-not (GenXdev.AI\Get-HasCapableGpu)) {

            # output cpu optimization message for verbose logging
            Microsoft.PowerShell.Utility\Write-Verbose (
                "No capable GPU detected, optimizing for CPU"
            )

            # set cpu threads to number of available cores
            if (-not $PSBoundParameters.ContainsKey("CpuThreads")) {

                $null = $PSBoundParameters.Add(
                    "CpuThreads",
                    (GenXdev.AI\Get-NumberOfCpuCores)
                )
            }
        }

        # clean up null parameters from bound parameters collection
        Microsoft.PowerShell.Utility\Write-Verbose "Cleaning up null parameters"

        # iterate over a snapshot of the keys so that removing entries never
        # mutates the dictionary that is currently being enumerated
        foreach ($boundKey in @($PSBoundParameters.Keys)) {

            $boundValue = $PSBoundParameters[$boundKey]

            # null and -1 are both treated as "not set"
            if ($null -eq $boundValue -or ($boundValue -eq -1)) {

                $null = $PSBoundParameters.Remove($boundKey)
            }
        }

        # preserve error handling state for restoration later
        $oldErrorActionPreference = $ErrorActionPreference

        $ErrorActionPreference = "Stop"

        try {

            # output transcription preparation message for verbose logging
            Microsoft.PowerShell.Utility\Write-Verbose (
                "Preparing transcription parameters"
            )

            # decide the processing mode once, BEFORE building the argument set,
            # so the arguments always match the function that is invoked below;
            # a supplied wave file always selects batch processing, anything
            # else is a live (realtime) capture
            $hasWaveFile = $PSBoundParameters.ContainsKey("WaveFile") -and
                (-not [string]::IsNullOrWhiteSpace($WaveFile))

            $useRealtime = -not $hasWaveFile

            # prepare invocation arguments matching target function parameters
            $invocationArguments = GenXdev.Helpers\Copy-IdenticalParamValues `
                -BoundParameters $PSBoundParameters `
                -FunctionName ($useRealtime ?
                    "GenXdev.Helpers\Receive-RealTimeSpeechToText" :
                    "GenXdev.Helpers\Get-SpeechToText")

            # ensure language parameter is set using web language dictionary
            if ($PSBoundParameters.ContainsKey("Language")) {

                $invocationArguments.Language = (
                    GenXdev.Helpers\Get-WebLanguageDictionary
                )[$Language]
            }

            # determine the appropriate target description based on input type
            if ($hasWaveFile) {

                $targetDescription = "transcription of file '$WaveFile'"
            }
            elseif ($PSBoundParameters.ContainsKey("UseDesktopAudioCapture") -and
                    $UseDesktopAudioCapture) {

                $targetDescription = "desktop audio transcription"
            }
            else {
                $targetDescription = "microphone audio transcription"
            }

            # output speech to text conversion start message for verbose logging
            Microsoft.PowerShell.Utility\Write-Verbose (
                "Starting speech to text conversion using " +
                "$($useRealtime ? 'realtime' : 'batch') processing"
            )

            # add shouldprocess check before executing the operation
            if ($PSCmdlet.ShouldProcess($targetDescription, "Start")) {

                if ($useRealtime) {

                    GenXdev.Helpers\Receive-RealTimeSpeechToText @invocationArguments
                }
                else {
                    GenXdev.Helpers\Get-SpeechToText @invocationArguments
                }
            }
        }
        finally {
            # restore original error action preference
            $ErrorActionPreference = $oldErrorActionPreference
        }
    }

    end {
    }
}
###############################################################################