Functions/GenXdev.AI/Get-MediaFileAudioTranscription.ps1
################################################################################
<#
.SYNOPSIS
Transcribes an audio or video file to text.

.DESCRIPTION
Converts the given media file to a temporary 16 kHz mono 16-bit WAV file using
ffmpeg (installing ffmpeg through WinGet when it cannot be found), passes that
WAV file to Start-AudioTranscription and optionally translates the resulting
text with Get-TextTranslation. The temporary WAV file is removed afterwards.

.PARAMETER FilePath
The file path of the audio or video file to transcribe.

.PARAMETER LanguageIn
The language to expect in the audio, defaults to 'English'.
E.g. "English", "French", "German", "Dutch"

.PARAMETER LanguageOut
The language to translate to. E.g. "French", "German", "Dutch"

.PARAMETER TranslateUsingLMStudioModel
The LM Studio model to use for translation.

.PARAMETER SRT
Output in SRT format.

.PARAMETER PassThru
Returns objects instead of strings.

.PARAMETER UseDesktopAudioCapture
Whether to use desktop audio capture instead of microphone input

.PARAMETER WithTokenTimestamps
Whether to include token timestamps in the output.

.PARAMETER TokenTimestampsSumThreshold
Sum threshold for token timestamps, defaults to 0.5.

.PARAMETER SplitOnWord
Whether to split on word boundaries.

.PARAMETER MaxTokensPerSegment
Maximum number of tokens per segment.

.PARAMETER IgnoreSilence
Whether to ignore silence (will mess up timestamps).

.PARAMETER MaxDurationOfSilence
Maximum duration of silence before automatically stopping recording.

.PARAMETER SilenceThreshold
Silence detect threshold (0..32767 defaults to 30)

.PARAMETER CpuThreads
Number of CPU threads to use, defaults to 0 (auto).

.PARAMETER Temperature
Temperature for speech recognition.

.PARAMETER TemperatureInc
Temperature increment.

.PARAMETER Prompt
Prompt to use for the model.

.PARAMETER SuppressRegex
Regex to suppress tokens from the output.

.PARAMETER WithProgress
Whether to show progress.

.PARAMETER AudioContextSize
Size of the audio context.

.PARAMETER DontSuppressBlank
Whether to NOT suppress blank lines.

.PARAMETER MaxDuration
Maximum duration of the audio.

.PARAMETER Offset
Offset for the audio.

.PARAMETER MaxLastTextTokens
Maximum number of last text tokens.

.PARAMETER SingleSegmentOnly
Whether to use single segment only.

.PARAMETER PrintSpecialTokens
Whether to print special tokens.

.PARAMETER MaxSegmentLength
Maximum segment length.

.PARAMETER MaxInitialTimestamp
Start timestamps at this moment.

.PARAMETER LengthPenalty
Length penalty.

.PARAMETER EntropyThreshold
Entropy threshold.

.PARAMETER LogProbThreshold
Log probability threshold.

.PARAMETER NoSpeechThreshold
No speech threshold.

.PARAMETER NoContext
Don't use context.

.PARAMETER WithBeamSearchSamplingStrategy
Use beam search sampling strategy.

.EXAMPLE
Get-MediaFileAudioTranscription -FilePath "C:\path\to\audio.wav" -LanguageIn "English" -LanguageOut "French" -SRT
#>
function Get-MediaFileAudioTranscription {

    [CmdletBinding()]
    param (
        ########################################################################
        [Parameter(
            Mandatory,
            Position = 0,
            HelpMessage = "The file path of the audio or video file to transcribe."
        )]
        [string] $FilePath,
        ########################################################################
        [Parameter(
            Mandatory = $false,
            Position = 1,
            HelpMessage = "The language to expect in the audio, defaults to 'English'."
        )]
        [PSDefaultValue(Value = "English")]
        [ValidateSet(
            "Afrikaans", "Akan", "Albanian", "Amharic", "Arabic", "Armenian",
            "Azerbaijani", "Basque", "Belarusian", "Bemba", "Bengali", "Bihari",
            "Bork, bork, bork!", "Bosnian", "Breton", "Bulgarian", "Cambodian",
            "Catalan", "Cherokee", "Chichewa", "Chinese (Simplified)",
            "Chinese (Traditional)", "Corsican", "Croatian", "Czech", "Danish",
            "Dutch", "Elmer Fudd", "English", "Esperanto", "Estonian", "Ewe",
            "Faroese", "Filipino", "Finnish", "French", "Frisian", "Ga",
            "Galician", "Georgian", "German", "Greek", "Guarani", "Gujarati",
            "Hacker", "Haitian Creole", "Hausa", "Hawaiian", "Hebrew", "Hindi",
            "Hungarian", "Icelandic", "Igbo", "Indonesian", "Interlingua",
            "Irish", "Italian", "Japanese", "Javanese", "Kannada", "Kazakh",
            "Kinyarwanda", "Kirundi", "Klingon", "Kongo", "Korean",
            "Krio (Sierra Leone)", "Kurdish", "Kurdish (Soranî)", "Kyrgyz",
            "Laothian", "Latin", "Latvian", "Lingala", "Lithuanian", "Lozi",
            "Luganda", "Luo", "Macedonian", "Malagasy", "Malay", "Malayalam",
            "Maltese", "Maori", "Marathi", "Mauritian Creole", "Moldavian",
            "Mongolian", "Montenegrin", "Nepali", "Nigerian Pidgin",
            "Northern Sotho", "Norwegian", "Norwegian (Nynorsk)", "Occitan",
            "Oriya", "Oromo", "Pashto", "Persian", "Pirate", "Polish",
            "Portuguese (Brazil)", "Portuguese (Portugal)", "Punjabi",
            "Quechua", "Romanian", "Romansh", "Runyakitara", "Russian",
            "Scots Gaelic", "Serbian", "Serbo-Croatian", "Sesotho", "Setswana",
            "Seychellois Creole", "Shona", "Sindhi", "Sinhalese", "Slovak",
            "Slovenian", "Somali", "Spanish", "Spanish (Latin American)",
            "Sundanese", "Swahili", "Swedish", "Tajik", "Tamil", "Tatar",
            "Telugu", "Thai", "Tigrinya", "Tonga", "Tshiluba", "Tumbuka",
            "Turkish", "Turkmen", "Twi", "Uighur", "Ukrainian", "Urdu", "Uzbek",
            "Vietnamese", "Welsh", "Wolof", "Xhosa", "Yiddish", "Yoruba", "Zulu")]
        [string] $LanguageIn = "English",
        ########################################################################
        [Parameter(
            Mandatory = $false,
            Position = 2,
            HelpMessage = "Sets the language to translate to."
        )]
        [ValidateSet(
            "Afrikaans", "Akan", "Albanian", "Amharic", "Arabic", "Armenian",
            "Azerbaijani", "Basque", "Belarusian", "Bemba", "Bengali", "Bihari",
            "Bork, bork, bork!", "Bosnian", "Breton", "Bulgarian", "Cambodian",
            "Catalan", "Cherokee", "Chichewa", "Chinese (Simplified)",
            "Chinese (Traditional)", "Corsican", "Croatian", "Czech", "Danish",
            "Dutch", "Elmer Fudd", "English", "Esperanto", "Estonian", "Ewe",
            "Faroese", "Filipino", "Finnish", "French", "Frisian", "Ga",
            "Galician", "Georgian", "German", "Greek", "Guarani", "Gujarati",
            "Hacker", "Haitian Creole", "Hausa", "Hawaiian", "Hebrew", "Hindi",
            "Hungarian", "Icelandic", "Igbo", "Indonesian", "Interlingua",
            "Irish", "Italian", "Japanese", "Javanese", "Kannada", "Kazakh",
            "Kinyarwanda", "Kirundi", "Klingon", "Kongo", "Korean",
            "Krio (Sierra Leone)", "Kurdish", "Kurdish (Soranî)", "Kyrgyz",
            "Laothian", "Latin", "Latvian", "Lingala", "Lithuanian", "Lozi",
            "Luganda", "Luo", "Macedonian", "Malagasy", "Malay", "Malayalam",
            "Maltese", "Maori", "Marathi", "Mauritian Creole", "Moldavian",
            "Mongolian", "Montenegrin", "Nepali", "Nigerian Pidgin",
            "Northern Sotho", "Norwegian", "Norwegian (Nynorsk)", "Occitan",
            "Oriya", "Oromo", "Pashto", "Persian", "Pirate", "Polish",
            "Portuguese (Brazil)", "Portuguese (Portugal)", "Punjabi",
            "Quechua", "Romanian", "Romansh", "Runyakitara", "Russian",
            "Scots Gaelic", "Serbian", "Serbo-Croatian", "Sesotho", "Setswana",
            "Seychellois Creole", "Shona", "Sindhi", "Sinhalese", "Slovak",
            "Slovenian", "Somali", "Spanish", "Spanish (Latin American)",
            "Sundanese", "Swahili", "Swedish", "Tajik", "Tamil", "Tatar",
            "Telugu", "Thai", "Tigrinya", "Tonga", "Tshiluba", "Tumbuka",
            "Turkish", "Turkmen", "Twi", "Uighur", "Ukrainian", "Urdu", "Uzbek",
            "Vietnamese", "Welsh", "Wolof", "Xhosa", "Yiddish", "Yoruba", "Zulu")]
        [string]$LanguageOut = $null,
        ########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "The LM Studio model to use for translation."
        )]
        [string] $TranslateUsingLMStudioModel = "llama",
        ########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Output in SRT format."
        )]
        [switch] $SRT,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Returns objects instead of strings")]
        [switch] $PassThru,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Whether to use desktop audio capture instead of microphone input")]
        [switch] $UseDesktopAudioCapture,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Whether to include token timestamps in the output")]
        [switch] $WithTokenTimestamps,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Sum threshold for token timestamps, defaults to 0.5")]
        [float] $TokenTimestampsSumThreshold = 0.5,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Whether to split on word boundaries")]
        [switch] $SplitOnWord,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Maximum number of tokens per segment")]
        [int] $MaxTokensPerSegment,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Whether to ignore silence (will mess up timestamps)")]
        [switch] $IgnoreSilence,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Maximum duration of silence before automatically stopping recording")]
        [timespan] $MaxDurationOfSilence,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Silence detect threshold (0..32767 defaults to 30)")]
        [ValidateRange(0, 32767)]
        [int] $SilenceThreshold,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Number of CPU threads to use, defaults to 0 (auto)")]
        [int] $CpuThreads = 0,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Temperature for speech recognition")]
        [ValidateRange(0, 100)]
        [float] $Temperature = 0.01,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Temperature increment")]
        [ValidateRange(0, 1)]
        [float] $TemperatureInc,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Prompt to use for the model")]
        [string] $Prompt,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Regex to suppress tokens from the output")]
        [string] $SuppressRegex = $null,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Whether to show progress")]
        [switch] $WithProgress,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Size of the audio context")]
        [int] $AudioContextSize,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Whether to NOT suppress blank lines")]
        [switch] $DontSuppressBlank,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Maximum duration of the audio")]
        [timespan] $MaxDuration,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Offset for the audio")]
        [timespan] $Offset,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Maximum number of last text tokens")]
        [int] $MaxLastTextTokens,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Whether to use single segment only")]
        [switch] $SingleSegmentOnly,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Whether to print special tokens")]
        [switch] $PrintSpecialTokens,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Maximum segment length")]
        [int] $MaxSegmentLength,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Start timestamps at this moment")]
        [timespan] $MaxInitialTimestamp,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Length penalty")]
        [ValidateRange(0, 1)]
        [float] $LengthPenalty,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Entropy threshold")]
        [ValidateRange(0, 1)]
        [float] $EntropyThreshold,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Log probability threshold")]
        [ValidateRange(0, 1)]
        [float] $LogProbThreshold,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "No speech threshold")]
        [ValidateRange(0, 1)]
        [float] $NoSpeechThreshold,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Don't use context")]
        [switch] $NoContext,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Use beam search sampling strategy")]
        [switch] $WithBeamSearchSamplingStrategy
    )

    process {

        # Returns the full path of the first ffmpeg.exe found below the WinGet
        # folder in the user's local application data, or nothing when absent.
        function FindFfmpegExecutable {

            Get-ChildItem `
                -Path "${env:LOCALAPPDATA}\Microsoft\WinGet" `
                -Filter "ffmpeg.exe" `
                -File `
                -Recurse `
                -ErrorAction SilentlyContinue |
                Select-Object -First 1 |
                ForEach-Object FullName
        }

        # Makes sure the Microsoft.WinGet.Client module is loaded, installing
        # it first when importing it fails.
        function EnsureWinGetClient {

            Import-Module "Microsoft.WinGet.Client" -ErrorAction SilentlyContinue

            if ($null -eq (Get-Module "Microsoft.WinGet.Client" -ErrorAction SilentlyContinue)) {

                Write-Verbose "Installing WinGet PowerShell client.."
                Install-Module "Microsoft.WinGet.Client" -Force -AllowClobber
                Import-Module "Microsoft.WinGet.Client"
            }
        }

        # Installs the Gyan.FFmpeg package, falling back to the winget
        # command line tool when the cmdlet throws.
        function InstallFfmpegPackage {

            EnsureWinGetClient

            $packageId = "Gyan.FFmpeg"

            Write-Verbose "Installing ffmpeg.."
            try {
                Install-WinGetPackage -Id $packageId -Force
            }
            catch {
                winget install $packageId
            }
        }

        # Locate ffmpeg; when it is missing, install it and look again. The
        # lookup result is assigned here, in the scope that uses it, because an
        # assignment inside a nested function only creates a local variable.
        $ffmpegPath = FindFfmpegExecutable

        if ([string]::IsNullOrWhiteSpace($ffmpegPath)) {

            InstallFfmpegPackage | Out-Null
            $ffmpegPath = FindFfmpegExecutable
        }

        if ([string]::IsNullOrWhiteSpace($ffmpegPath)) {

            Write-Warning "Unable to locate or install ffmpeg, cannot convert the file '$FilePath' to WAV format."
            return
        }

        $inputFile = Expand-Path $FilePath

        # Build a unique temp path without creating a file. GetTempFileName()
        # would create a zero-byte .tmp file that is never cleaned up once
        # ".wav" is appended to its name.
        $outputFile = [IO.Path]::Combine(
            [IO.Path]::GetTempPath(),
            [IO.Path]::GetRandomFileName() + ".wav"
        )

        # Run ffmpeg in a background job; the job returns ffmpeg's exit code
        $job = Start-Job -ArgumentList $ffmpegPath, $inputFile, $outputFile -ScriptBlock {

            param($ffmpegPath, $inputFile, $outputFile)

            try {
                [System.Console]::WriteLine("Converting the file '$inputFile' to WAV format..");

                # Convert the file to 16 kHz, mono, signed 16-bit WAV
                & $ffmpegPath -i "$inputFile" -ac 1 -ar 16000 -sample_fmt s16 "$outputFile" -loglevel quiet -y | Out-Null
            }
            finally {
                # Move the cursor up one line and erase the status message
                [System.Console]::Write("`e[1A`e[2K")
            }

            return $LASTEXITCODE
        }

        # Wait for the job to complete and check the result
        $job | Wait-Job | Out-Null
        $success = ($job | Receive-Job) -eq 0
        Remove-Job -Job $job | Out-Null

        if (-not $success) {

            Write-Warning "Failed to convert the file '$inputFile' to WAV format."

            # Clean up the temporary file
            if ([IO.File]::Exists($outputFile)) {

                Remove-Item -Path $outputFile -Force | Out-Null
            }

            return
        }

        # Build the argument set for Start-AudioTranscription from a copy of
        # the bound parameters, so $PSBoundParameters itself stays untouched.
        $transcriptionArgs = @{}
        foreach ($boundParameter in $PSBoundParameters.GetEnumerator()) {

            $transcriptionArgs[$boundParameter.Key] = $boundParameter.Value
        }

        # SRT output is built from result objects, so PassThru is forced on
        # unless the caller bound it explicitly.
        # NOTE(review): without -SRT a bound -PassThru is dropped, so it
        # currently has no effect on the plain-text path. Confirm intent.
        if (($SRT -eq $true) -and (-not $transcriptionArgs.ContainsKey("PassThru"))) {

            $transcriptionArgs["PassThru"] = $true
        }
        elseif ((-not $SRT) -and $transcriptionArgs.ContainsKey("PassThru")) {

            $transcriptionArgs.Remove("PassThru")
        }

        # Parameters handled by this function are not forwarded
        foreach ($ownParameter in @(
                "FilePath",
                "LanguageIn",
                "LanguageOut",
                "SRT",
                "TranslateUsingLMStudioModel",
                "WithTranslate")) {

            $transcriptionArgs.Remove($ownParameter)
        }

        $transcriptionArgs["WaveFile"] = $outputFile

        if (-not $transcriptionArgs.ContainsKey("ErrorAction")) {

            $transcriptionArgs["ErrorAction"] = "Stop"
        }

        # NOTE(review): $ModelFilePath is not a declared parameter of this
        # function; it resolves through normal scope lookup and is $null when
        # nothing defines it. Kept as in the original. Confirm intent.
        $transcriptionArgs["ModelFilePath"] = $ModelFilePath

        if ([string]::IsNullOrWhiteSpace($LanguageIn)) {

            $LanguageIn = "English"
        }

        $transcriptionArgs["Language"] = $LanguageIn

        # Without a capable GPU, use all CPU cores unless the caller chose
        if ((-not (Get-HasCapableGpu)) -and (-not $transcriptionArgs.ContainsKey("CpuThreads"))) {

            $transcriptionArgs["CpuThreads"] = (Get-NumberOfCpuCores)
        }

        $needsTranslation = -not [string]::IsNullOrWhiteSpace($LanguageOut)

        try {

            # outputting in SRT format?
            if ($SRT) {

                # SRT entries are numbered starting at 1
                $entryNumber = 1

                Start-AudioTranscription @transcriptionArgs | ForEach-Object {

                    $segment = $PSItem

                    if ($needsTranslation) {

                        Write-Verbose "Translating text to $LanguageOut for: `"$($segment.Text)`".."

                        try {
                            # Translate the text while keeping the timing.
                            # NOTE(review): the prompt says "ommit"; "emit" or
                            # "output" was probably intended. Left unchanged.
                            $segment = @{
                                Text  = (Get-TextTranslation -Text:($segment.Text) -Language:$LanguageOut -Model:$TranslateUsingLMStudioModel -Instructions "Translate this partial subtitle text, into the [Language] language. ommit only the translation no yapping or chatting. return in json format like so: {`"Translation`":`"Translated text here`"}" | ConvertFrom-Json).Translation;
                                Start = $segment.Start;
                                End   = $segment.End;
                            }

                            Write-Verbose "Text translated to: `"$($segment.Text)`".."
                        }
                        catch {
                            # On failure the untranslated segment is emitted
                            Write-Verbose "Translating text to $LanguageOut, failed: $PSItem"
                        }
                    }

                    $start = $segment.Start.ToString("hh\:mm\:ss\,fff", [CultureInfo]::InvariantCulture);
                    $end = $segment.End.ToString("hh\:mm\:ss\,fff", [CultureInfo]::InvariantCulture);

                    "$entryNumber`r`n$start --> $end`r`n$($segment.Text)`r`n`r`n"

                    $entryNumber++
                }

                # end of SRT format
                return
            }

            if ($needsTranslation) {

                # transcribe everything first, then translate in one call
                $results = Start-AudioTranscription @transcriptionArgs

                Get-TextTranslation -Text "$results" -Language $LanguageOut -Model $TranslateUsingLMStudioModel

                # end of translation
                return
            }

            # return the text results without translation
            Start-AudioTranscription @transcriptionArgs
        }
        catch {

            # Errors whose message contains "aborted" are ignored
            if ("$PSItem" -notlike "*aborted*") {

                Write-Error $PSItem
            }
        }
        finally {

            # Clean up the temporary file
            if ([IO.File]::Exists($outputFile)) {

                Remove-Item -Path $outputFile -Force
            }
        }
    }
}