GenXdev.AI

1.200.2025

Functions/GenXdev.AI.Queries/Start-AudioTranscription.ps1

                                ###############################################################################

<#

.SYNOPSIS

Transcribes audio to text using various input methods and advanced configuration

options.

.DESCRIPTION

This function provides comprehensive audio transcription capabilities, supporting

both real-time recording and file-based transcription. It offers extensive

configuration options for language detection, audio processing, and output

formatting.

Key features:

- Multiple audio input sources (microphone, desktop audio, wav files)

- Automatic silence detection (VOX)

- Multi-language support

- Token timestamp generation

- CPU/GPU processing optimization

- Advanced audio processing parameters

.PARAMETER ModelFilePath

Path to store model files. Defaults to local GenXdev folder.

.PARAMETER WaveFile

Path to the 16 kHz mono .WAV file to process.

.PARAMETER VOX

Use silence detection to automatically stop recording.

.PARAMETER PassThru

Returns objects instead of strings.

.PARAMETER UseDesktopAudioCapture

Whether to use desktop audio capture instead of microphone input.

.PARAMETER WithTokenTimestamps

Whether to include token timestamps in the output.

.PARAMETER TokenTimestampsSumThreshold

Sum threshold for token timestamps, defaults to 0.5.

.PARAMETER SplitOnWord

Whether to split on word boundaries.

.PARAMETER MaxTokensPerSegment

Maximum number of tokens per segment.

.PARAMETER IgnoreSilence

Whether to ignore silence (will mess up timestamps).

.PARAMETER MaxDurationOfSilence

Maximum duration of silence before automatically stopping recording.

.PARAMETER SilenceThreshold

Silence detect threshold (0..32767 defaults to 30).

.PARAMETER Language

Sets the language to detect.

.PARAMETER CpuThreads

Number of CPU threads to use, defaults to 0 (auto).

.PARAMETER Temperature

Temperature for speech generation.

.PARAMETER TemperatureInc

Temperature increment.

.PARAMETER WithTranslate

Whether to translate the output.

.PARAMETER Prompt

Prompt to use for the model.

.PARAMETER SuppressRegex

Regex to suppress tokens from the output.

.PARAMETER WithProgress

Whether to show progress.

.PARAMETER AudioContextSize

Size of the audio context.

.PARAMETER DontSuppressBlank

Whether to NOT suppress blank lines.

.PARAMETER MaxDuration

Maximum duration of the audio.

.PARAMETER Offset

Offset for the audio.

.PARAMETER MaxLastTextTokens

Maximum number of last text tokens.

.PARAMETER SingleSegmentOnly

Whether to use single segment only.

.PARAMETER PrintSpecialTokens

Whether to print special tokens.

.PARAMETER MaxSegmentLength

Maximum segment length.

.PARAMETER MaxInitialTimestamp

Start timestamps at this moment.

.PARAMETER LengthPenalty

Length penalty.

.PARAMETER EntropyThreshold

Entropy threshold.

.PARAMETER LogProbThreshold

Log probability threshold.

.PARAMETER NoSpeechThreshold

No speech threshold.

.PARAMETER NoContext

Don't use context.

.PARAMETER WithBeamSearchSamplingStrategy

Use beam search sampling strategy.

.PARAMETER Realtime

Enable real-time transcription mode.

.PARAMETER SessionOnly

Use alternative settings stored in session for AI preferences like Language,

Image collections, etc.

.PARAMETER ClearSession

Clear alternative settings stored in session for AI preferences like Language,

Image collections, etc.

.PARAMETER PreferencesDatabasePath

Database path for preference data files.

.PARAMETER SkipSession

Don't use alternative settings stored in session for AI preferences like

Language, Image collections, etc.

.EXAMPLE

Start-AudioTranscription -ModelFilePath "C:\Models" -Language "English" `

    -WithTokenTimestamps -PassThru:$false

.EXAMPLE

transcribe -VOX -UseDesktopAudioCapture -Language "English"

#>

function Start-AudioTranscription {

    [Alias("transcribe", "recordandtranscribe")]
    [CmdletBinding(SupportsShouldProcess = $true)]
    param (
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            Position = 0,
            HelpMessage = "Path where model files are stored"
        )]
        [string] $ModelFilePath,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            Position = 1,
            HelpMessage = "Path to the 16Khz mono, .WAV file to process"
        )]
        [string] $WaveFile = $null,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Maximum duration of silence before automatically " +
                         "stopping recording"
        )]
        [object] $MaxDurationOfSilence,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Silence detect threshold (0..32767 defaults to 30)"
        )]
        [ValidateRange(0, 32767)]
        [int] $SilenceThreshold = 30,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Sets the language to detect"
        )]
        [ValidateSet(
            "Afrikaans",
            "Akan",
            "Albanian",
            "Amharic",
            "Arabic",
            "Armenian",
            "Azerbaijani",
            "Basque",
            "Belarusian",
            "Bemba",
            "Bengali",
            "Bihari",
            "Bork, bork, bork!",
            "Bosnian",
            "Breton",
            "Bulgarian",
            "Cambodian",
            "Catalan",
            "Cherokee",
            "Chichewa",
            "Chinese (Simplified)",
            "Chinese (Traditional)",
            "Corsican",
            "Croatian",
            "Czech",
            "Danish",
            "Dutch",
            "Elmer Fudd",
            "English",
            "Esperanto",
            "Estonian",
            "Ewe",
            "Faroese",
            "Filipino",
            "Finnish",
            "French",
            "Frisian",
            "Ga",
            "Galician",
            "Georgian",
            "German",
            "Greek",
            "Guarani",
            "Gujarati",
            "Hacker",
            "Haitian Creole",
            "Hausa",
            "Hawaiian",
            "Hebrew",
            "Hindi",
            "Hungarian",
            "Icelandic",
            "Igbo",
            "Indonesian",
            "Interlingua",
            "Irish",
            "Italian",
            "Japanese",
            "Javanese",
            "Kannada",
            "Kazakh",
            "Kinyarwanda",
            "Kirundi",
            "Klingon",
            "Kongo",
            "Korean",
            "Krio (Sierra Leone)",
            "Kurdish",
            "Kurdish (Soranî)",
            "Kyrgyz",
            "Laothian",
            "Latin",
            "Latvian",
            "Lingala",
            "Lithuanian",
            "Lozi",
            "Luganda",
            "Luo",
            "Macedonian",
            "Malagasy",
            "Malay",
            "Malayalam",
            "Maltese",
            "Maori",
            "Marathi",
            "Mauritian Creole",
            "Moldavian",
            "Mongolian",
            "Montenegrin",
            "Nepali",
            "Nigerian Pidgin",
            "Northern Sotho",
            "Norwegian",
            "Norwegian (Nynorsk)",
            "Occitan",
            "Oriya",
            "Oromo",
            "Pashto",
            "Persian",
            "Pirate",
            "Polish",
            "Portuguese (Brazil)",
            "Portuguese (Portugal)",
            "Punjabi",
            "Quechua",
            "Romanian",
            "Romansh",
            "Runyakitara",
            "Russian",
            "Scots Gaelic",
            "Serbian",
            "Serbo-Croatian",
            "Sesotho",
            "Setswana",
            "Seychellois Creole",
            "Shona",
            "Sindhi",
            "Sinhalese",
            "Slovak",
            "Slovenian",
            "Somali",
            "Spanish",
            "Spanish (Latin American)",
            "Sundanese",
            "Swahili",
            "Swedish",
            "Tajik",
            "Tamil",
            "Tatar",
            "Telugu",
            "Thai",
            "Tigrinya",
            "Tonga",
            "Tshiluba",
            "Tumbuka",
            "Turkish",
            "Turkmen",
            "Twi",
            "Uighur",
            "Ukrainian",
            "Urdu",
            "Uzbek",
            "Vietnamese",
            "Welsh",
            "Wolof",
            "Xhosa",
            "Yiddish",
            "Yoruba",
            "Zulu"
        )]
        [string] $Language,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Number of CPU threads to use, defaults to 0 (auto)"
        )]
        [int] $CpuThreads = 0,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Temperature for speech generation"
        )]
        [ValidateRange(0, 100)]
        [float] $Temperature = 0.01,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Temperature increment"
        )]
        [ValidateRange(0, 1)]
        [float] $TemperatureInc,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Prompt to use for the model"
        )]
        [string] $Prompt,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Regex to suppress tokens from the output"
        )]
        [string] $SuppressRegex = $null,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Size of the audio context"
        )]
        [int] $AudioContextSize,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Maximum duration of the audio"
        )]
        [object] $MaxDuration,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Offset for the audio"
        )]
        [object] $Offset,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Maximum number of last text tokens"
        )]
        [int] $MaxLastTextTokens,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Maximum segment length"
        )]
        [int] $MaxSegmentLength,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Start timestamps at this moment"
        )]
        [object] $MaxInitialTimestamp,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Length penalty"
        )]
        [ValidateRange(0, 1)]
        [float] $LengthPenalty,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Entropy threshold"
        )]
        [ValidateRange(0, 1)]
        [float] $EntropyThreshold,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Log probability threshold"
        )]
        [ValidateRange(0, 1)]
        [float] $LogProbThreshold,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "No speech threshold"
        )]
        [ValidateRange(0, 1)]
        [float] $NoSpeechThreshold,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Sum threshold for token timestamps, defaults to 0.5"
        )]
        [float] $TokenTimestampsSumThreshold = 0.5,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Maximum number of tokens per segment"
        )]
        [int] $MaxTokensPerSegment,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Database path for preference data files"
        )]
        [string] $PreferencesDatabasePath,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Use silence detection to automatically stop recording."
        )]
        [switch] $VOX,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Returns objects instead of strings"
        )]
        [switch] $PassThru,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Whether to use desktop audio capture instead of " +
                         "microphone input"
        )]
        [switch] $UseDesktopAudioCapture,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Whether to include token timestamps in the output"
        )]
        [switch] $WithTokenTimestamps,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Whether to split on word boundaries"
        )]
        [switch] $SplitOnWord,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Whether to ignore silence (will mess up timestamps)"
        )]
        [switch] $IgnoreSilence,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Whether to translate the output"
        )]
        [switch] $WithTranslate,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Whether to show progress"
        )]
        [switch] $WithProgress,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Whether to NOT suppress blank lines"
        )]
        [switch] $DontSuppressBlank,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Whether to use single segment only"
        )]
        [switch] $SingleSegmentOnly,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Whether to print special tokens"
        )]
        [switch] $PrintSpecialTokens,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Don't use context"
        )]
        [switch] $NoContext,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Use beam search sampling strategy"
        )]
        [switch] $WithBeamSearchSamplingStrategy,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Enable real-time transcription mode"
        )]
        [switch] $Realtime,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Use alternative settings stored in session for AI " +
                         "preferences like Language, Image collections, etc"
        )]
        [switch] $SessionOnly,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Clear alternative settings stored in session for AI " +
                         "preferences like Language, Image collections, etc"
        )]
        [switch] $ClearSession,
        ###########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Dont use alternative settings stored in session for " +
                         "AI preferences like Language, Image collections, etc"
        )]
        [Alias("FromPreferences")]
        [switch] $SkipSession
        ###########################################################################
    )

    begin {

        # collect the parameters that Get-AIMetaLanguage also accepts, using the
        # local variables as fallback values for anything the caller omitted
        $params = GenXdev.Helpers\Copy-IdenticalParamValues `
            -BoundParameters $PSBoundParameters `
            -FunctionName "GenXdev.AI\Get-AIMetaLanguage" `
            -DefaultValues (Microsoft.PowerShell.Utility\Get-Variable `
                -Scope Local `
                -ErrorAction SilentlyContinue)

        # resolve the effective language through the preference helper
        # NOTE(review): $Language carries a ValidateSet, so this assignment
        # presumably fails if the helper returns a name outside that set -
        # confirm against Get-AIMetaLanguage
        $Language = GenXdev.AI\Get-AIMetaLanguage @params

        # output initialization message for verbose logging
        Microsoft.PowerShell.Utility\Write-Verbose (
            "Initializing audio transcription with selected options"
        )

        # duration style parameters are typed [object]; any explicitly passed
        # value that is not already a timespan is interpreted as seconds
        foreach ($durationParameterName in @(
                "MaxDurationOfSilence",
                "MaxDuration",
                "Offset",
                "MaxInitialTimestamp"
            )) {

            if (-not $PSBoundParameters.ContainsKey($durationParameterName)) {

                continue
            }

            $durationValue = $PSBoundParameters[$durationParameterName]

            if ($durationValue -is [System.TimeSpan]) {

                continue
            }

            $durationValue = [System.TimeSpan]::FromSeconds($durationValue)

            # keep the bound parameters and the local variable in sync
            $PSBoundParameters[$durationParameterName] = $durationValue

            Microsoft.PowerShell.Utility\Set-Variable `
                -Name $durationParameterName `
                -Value $durationValue
        }
    }

    process {

        # fall back to the default model folder when no existing path was given
        if ([string]::IsNullOrWhiteSpace($ModelFilePath) -or
            (-not ([IO.Directory]::Exists($ModelFilePath)))) {

            $ModelFilePath = GenXdev.FileSystem\Expand-Path (
                "$($Env:LOCALAPPDATA)\GenXdev.PowerShell\"
            ) -CreateDirectory
        }

        # output model path information for verbose logging
        Microsoft.PowerShell.Utility\Write-Verbose "Using model path: $ModelFilePath"

        # the indexer adds the entry when missing and overwrites it otherwise
        $PSBoundParameters["ModelFilePath"] = $ModelFilePath

        # configure voice activation detection (VOX) settings
        if ($VOX -eq $true) {

            # output vox configuration message for verbose logging
            Microsoft.PowerShell.Utility\Write-Verbose "Configuring VOX settings"

            # 4 seconds is only a default: an explicitly passed
            # -MaxDurationOfSilence must not be overwritten
            if (-not $PSBoundParameters.ContainsKey("MaxDurationOfSilence")) {

                $PSBoundParameters["MaxDurationOfSilence"] = [System.TimeSpan]::FromSeconds(4)
            }

            # vox mode always ignores silence
            $PSBoundParameters["IgnoreSilence"] = $true

            # vox has been translated into the settings above, so drop it
            $null = $PSBoundParameters.Remove("VOX")
        }

        # ensure error action is set to stop for proper error handling
        if (-not $PSBoundParameters.ContainsKey("ErrorAction")) {

            $null = $PSBoundParameters.Add("ErrorAction", "Stop")
        }

        # optimize for cpu when no capable gpu is present
        if (-not (GenXdev.AI\Get-HasCapableGpu)) {

            # output cpu optimization message for verbose logging
            Microsoft.PowerShell.Utility\Write-Verbose (
                "No capable GPU detected, optimizing for CPU"
            )

            # use all available cores unless the caller chose a thread count
            if (-not $PSBoundParameters.ContainsKey("CpuThreads")) {

                $null = $PSBoundParameters.Add(
                    "CpuThreads",
                    (GenXdev.AI\Get-NumberOfCpuCores)
                )
            }
        }

        # clean up null parameters from bound parameters collection
        Microsoft.PowerShell.Utility\Write-Verbose "Cleaning up null parameters"

        # iterate over a snapshot of the keys so that removing entries never
        # happens while the dictionary itself is being enumerated
        foreach ($parameterName in @($PSBoundParameters.Keys)) {

            $parameterValue = $PSBoundParameters[$parameterName]

            if ($null -eq $parameterValue -or ($parameterValue -eq -1)) {

                $null = $PSBoundParameters.Remove($parameterName)
            }
        }

        # preserve error handling state for restoration later
        $oldErrorActionPreference = $ErrorActionPreference
        $ErrorActionPreference = "Stop"

        try {

            # output transcription preparation message for verbose logging
            Microsoft.PowerShell.Utility\Write-Verbose (
                "Preparing transcription parameters"
            )

            # a usable wave file always selects batch processing, even when
            # -Realtime was passed
            $hasWaveFile = $PSBoundParameters.ContainsKey("WaveFile") -and
                (-not [string]::IsNullOrWhiteSpace($WaveFile))

            # decide the processing mode once, before filtering arguments, so
            # the arguments always match the command that is actually invoked
            $useRealtime = (-not $hasWaveFile) -and
                ($Realtime -or ([string]::IsNullOrWhiteSpace($WaveFile)))

            # prepare invocation arguments matching target function parameters
            $invocationArguments = GenXdev.Helpers\Copy-IdenticalParamValues `
                -BoundParameters $PSBoundParameters `
                -FunctionName ($useRealtime ?
                    "GenXdev.Helpers\Receive-RealTimeSpeechToText" :
                    "GenXdev.Helpers\Get-SpeechToText")

            # translate an explicitly passed language via the web language
            # dictionary
            # NOTE(review): a name missing from the dictionary yields $null
            # here - confirm the target commands accept that
            if ($PSBoundParameters.ContainsKey("Language")) {

                $invocationArguments.Language = (
                    GenXdev.Helpers\Get-WebLanguageDictionary
                )[$Language]
            }

            # determine the appropriate target description based on input type
            if ($hasWaveFile) {

                $targetDescription = "transcription of file '$WaveFile'"
            }
            elseif ($PSBoundParameters.ContainsKey("UseDesktopAudioCapture") -and
                    $UseDesktopAudioCapture) {

                $targetDescription = "desktop audio transcription"
            }
            else {

                $targetDescription = "microphone audio transcription"
            }

            # output speech to text conversion start message for verbose logging
            Microsoft.PowerShell.Utility\Write-Verbose (
                "Starting speech to text conversion using " +
                "$($useRealtime ? 'realtime' : 'batch') processing"
            )

            # add shouldprocess check before executing the operation
            if ($PSCmdlet.ShouldProcess($targetDescription, "Start")) {

                if ($useRealtime) {

                    GenXdev.Helpers\Receive-RealTimeSpeechToText @invocationArguments
                }
                else {

                    GenXdev.Helpers\Get-SpeechToText @invocationArguments
                }
            }
        }
        finally {

            # restore original error action preference
            $ErrorActionPreference = $oldErrorActionPreference
        }
    }

    end {
    }
}

###############################################################################