#Region '.\Private\Compile-LanguageNightlyBuild.ps1' 0
<#
.SYNOPSIS
Runs C:\Tisane\tisaneCompiler.exe for one language, restarting the LaMP service when SQL Server looks stuck or the compiler fails.

.DESCRIPTION
1. Samples the '% Processor Time' counter of the sqlservr process; above 30 the LaMP service is restarted
   and the SQL Server caches are flushed before compiling.
2. Runs the compiler. On a non-zero exit code the same restart/flush is performed and the compiler is run
   once more. A second failure is reported as a warning.
The compress-and-upload script block is only defined; the Start-Job call that would run it is commented out.

.PARAMETER languageCode
Language code passed as the first argument to tisaneCompiler.exe.
#>
function Compile-LanguageNightlyBuild{
    param($languageCode)

    # Restart LaMP and flush the SQL Server caches. Used both before the build (suspected zombie SQL
    # process) and after a compiler crash.
    $restartLamp = {
        Stop-Service "Tisane LaMP *LaMP*"
        Start-Service "Tisane LaMP *LaMP*"
        Invoke-Sqlcmd -Query "USE tempdb; DBCC FREEPROCCACHE; DBCC DROPCLEANBUFFERS; DBCC FREESYSTEMCACHE ('ALL'); DBCC FREESESSIONCACHE"
    }

    $sqlCPUUsage = (Get-Counter '\Process(sqlservr)\% Processor Time').CounterSamples.CookedValue
    if ($sqlCPUUsage -gt 30) {
        # over 30% CPU usage => possibly a zombie SQL process. Restart LaMP, clean up the cache
        & $restartLamp
    }

    C:\Tisane\tisaneCompiler.exe "$languageCode"
    if ($LastExitCode -ne 0) {
        # it crashed. Restart LaMP, then try again
        & $restartLamp
        C:\Tisane\tisaneCompiler.exe "$languageCode"
        if ($LastExitCode -ne 0) {
            # Previously a second failure went unnoticed.
            Write-Warning "tisaneCompiler.exe failed again for '$languageCode' (exit code $LastExitCode)"
        }
    }

    # Packs the compiled output of the language plus the shared family/pragma/role folders into one zip.
    # Currently never invoked (see the commented-out Start-Job below).
    $compressAndUpload = {
        param($languageCode)
        $languageCode = $languageCode.Replace('-','_')
        $zipFullFilename = "C:\Tisane\tisaneDB$languageCode.zip"
        If (Test-Path $zipFullFilename){
            Remove-Item $zipFullFilename
        }
        Compress-Archive -Path "C:\Tisane\outdb\$languageCode-*" -DestinationPath $zipFullFilename
        Compress-Archive -Path "C:\Tisane\outdb\family" -Update -DestinationPath $zipFullFilename
        Compress-Archive -Path "C:\Tisane\outdb\pragma" -Update -DestinationPath $zipFullFilename
        Compress-Archive -Path "C:\Tisane\outdb\role" -Update -DestinationPath $zipFullFilename
        # SECURITY: the FTP user name and password that used to be embedded in the URL below were removed
        # from source. Load them from a secret store when this upload is re-enabled, and rotate the old ones.
        #$ftpTarget = "ftp://<user>:<password>@ftp.tisane.ai/nightly_tisane_$languageCode.zip"
        # $webclient = New-Object -TypeName System.Net.WebClient
        # $uri = New-Object -TypeName System.Uri -ArgumentList $ftpTarget
        # $webclient.UploadFile($uri, $zipFullFilename)
    }
    #Start-Job -ScriptBlock $compressAndUpload -ArgumentList $languageCode
}
#EndRegion '.\Private\Compile-LanguageNightlyBuild.ps1' 37
#Region '.\Private\Compile-SpellChecking.ps1' 0
<#
.SYNOPSIS
Deletes C:\Tisane\TisaneLaMP.log if present, then runs the spell-checking compilation for every listed language.

.DESCRIPTION
Writes the string "Compiling" to the output and calls CompileLanguage once per language code, in the fixed
order below (from the heaviest to the lightest).
#>
function Compile-SpellChecking {
    If (Test-Path "C:\Tisane\TisaneLaMP.log"){
        Remove-Item "C:\Tisane\TisaneLaMP.log"
    }
    "Compiling"
    # from the heaviest to the lightest
    $languageCodes = @(
        'ps-AF', 'ar', 'de', 'tr', 'vi',
        'es', 'fa', 'ru', 'fi', 'da',
        'ur', 'hi', 'id', 'it', 'ko',
        'ms', 'nl', 'no', 'pl', 'pt',
        'fr', 'sv', 'ta', 'he', 'en'
    )
    foreach ($code in $languageCodes) {
        CompileLanguage -languageCode $code
    }
}
#EndRegion '.\Private\Compile-SpellChecking.ps1' 36
#Region '.\Private\CompileLanguage.ps1' 0
<#
.SYNOPSIS
Runs C:\Tisane\tisaneCompiler.exe in "spell" mode for one language.

.PARAMETER languageCode
Language code passed as the first argument to the compiler; the literal argument "spell" follows it.
#>
function CompileLanguage{
    param($languageCode)
    C:\Tisane\tisaneCompiler.exe "$languageCode" spell
}
#EndRegion '.\Private\CompileLanguage.ps1' 5
#Region '.\Public\Bulk-Normalize.ps1' 0
## =============================================================================
##
## This script's purpose is to bulk-normalize lexemes selected by a SQL filter and retag them through
## the LaMP web service. MUST BE RUN ON THE LAMP SERVER
##
## =============================================================================
<#
.SYNOPSIS
Normalizes the lexemes matching a filter directly in the tisane database, then retags each one via the LaMP REST API.

.DESCRIPTION
Selects lexemes of the current language that were not already processed by this batch and have no
InflectingSegment. For every row:
  * non-MWE mode: MainLemma is replaced by the result of GetNormalizedWikidataLemma (when it returns a value);
  * default (MWE) mode: only lemmas containing a space are selected and the SET clause returned by
    GetSegmentSQL is applied.
After the UPDATE the lexeme's features (ConnectionType = 1) are deleted, the lexeme is fetched from
/lexeme, sent to /tagLemma and written back with PUT /lexeme.

.PARAMETER language
Language code, forwarded to GetNormalizedWikidataLemma / GetSegmentSQL.

.PARAMETER filter
Raw T-SQL condition appended to the WHERE clause as "AND (<filter>)". It is executed as-is, so it must
come from a trusted operator.

.PARAMETER nonMWE
When true, all lemmas are considered and MainLemma itself is normalized; otherwise only multi-word lemmas.
#>
function Bulk-Normalize{
[CmdletBinding()]
Param(
    [Parameter(Mandatory = $true, HelpMessage="Language code: ")][String] $language,
    [Parameter(Mandatory = $true, HelpMessage="Filter: ")][String] $filter,
    [Parameter(Mandatory = $false, HelpMessage="Non-MWE mode: ")][boolean] $nonMWE
)
    # . ".\normalizationLib.ps1"
    # NOTE(review): $path is not a parameter of this function; it is resolved from an outer scope - confirm.
    $config_path = $path + "Tisane.TestConsole.exe.config"
    [System.AppDomain]::CurrentDomain.SetData("APP_CONFIG_FILE", $config_path)
    Add-Type -AssemblyName System.Configuration
    # Reset the cached ConfigurationManager state through reflection so the APP_CONFIG_FILE set above is re-read.
    [Configuration.ConfigurationManager].GetField("s_initState", "NonPublic, Static").SetValue($null, 0)
    [Configuration.ConfigurationManager].GetField("s_configSystem", "NonPublic, Static").SetValue($null, $null)
    ([Configuration.ConfigurationManager].Assembly.GetTypes() | where {$_.FullName -eq "System.Configuration.ClientConfigPaths"}).GetField("s_current", "NonPublic, Static").SetValue($null, $null)
    [Configuration.ConfigurationManager]::ConnectionStrings[0].Name
    [Reflection.Assembly]::LoadFrom($path + "Tisane.Runtime.dll")
    # $OutputEncoding = [console]::InputEncoding = [console]::OutputEncoding = New-Object System.Text.UTF8Encoding
    Login-Lamp
    # $whatever = Invoke-WebRequest -Uri "$productionHost/setLanguage?language=$languageID" -Method POST -Headers $global:authorizationToken -Body ' '
    # NOTE(review): $languageId is never assigned in this function (the parameter is $language, a code);
    # it is presumably provided by an outer scope - confirm.
    Set-LampLanguage -languageId $languageId
    if ($nonMWE) {
        $mweSQL = ''
    } else {
        $mweSQL = "AND MainLemma LIKE '% %'"
    }
    $sql = "USE tisane; SELECT Id, MainLemma FROM Lexemes l WHERE LanguageId = $languageID AND (ISNULL(l.LastUpdatedByBatch,'') <> 'bulkNormalize.ps1') AND InflectingSegment IS NULL $mweSQL AND ($filter)"
    # @() keeps an empty result from being piped as a single $null item.
    $lexemes = @(Invoke-Sqlcmd -Query $sql)
    $lexemes | ForEach-Object {
        $mainLemma = $_.MainLemma
        $lexemeId = $_.Id
        # Reset per row: in MWE mode $updated is never assigned, and a stale value from a previous run or an
        # outer scope would otherwise replace the lemma sent to the web service below.
        $updated = $null
        if ($nonMWE) {
            $updated = GetNormalizedWikidataLemma -language $language -entry $mainLemma
            if ($updated) {
                # Double the apostrophes so lemmas such as "o'clock" do not break the statement.
                $updatedSql = $updated -replace "'", "''"
                $updateSQL = "MainLemma = N'$updatedSql'"
            } else {
                $updateSQL = ""
            }
        } else {
            $updateSQL = GetSegmentSQL -language $language -entry $mainLemma
        }
        if ($updateSQL) {
            Write-Host "[$lexemeId] $mainLemma UPDATE: $updateSQL" -ForegroundColor Green
            $sql = "USE tisane; UPDATE Lexemes SET $updateSQL, LastUpdatedByBatch = 'bulkNormalize.ps1', LastBatchUpdate = GETUTCDATE() WHERE Id = $lexemeId; DELETE dbo.Features WHERE ConnectionType = 1 AND EntityId = $lexemeId"
            #$sql
            Invoke-Sqlcmd -Query $sql
            $response = Invoke-RestMethod -Uri "$global:productionHost/lexeme?id=$lexemeId" -Method GET -Headers $global:authorizationToken
            $grammar = @()
            if ($updated) {
                $mainLemma = $updated
            }
            $lexeme = @{
                id=$lexemeId
                lemma=$mainLemma
                grammar=$grammar
            }
            $grammar = $response.grammar # existing grammar; cleared before if needed
            #$grammar += @([Feature]::new($featureIndex, $translatedTag, "Grammar"))
            $lexeme.grammar = $grammar
            $lexemeJson = ConvertTo-Json -InputObject $lexeme
            #"Tagging: " + $lexemeJson
            $taggedLemma = Invoke-RestMethod -Uri "$global:productionHost/tagLemma" -Method POST -Headers $global:authorizationToken -Body ([System.Text.Encoding]::UTF8.GetBytes($lexemeJson))
            $lexeme.grammar = $taggedLemma.grammar
            $lexeme.stem = $taggedLemma.stem
            if ($taggedLemma.style) {
                $lexeme.style = $taggedLemma.style
            }
            $lexeme.requestId = $response.requestId # need for the update request
            $lexemeJson = ConvertTo-Json -InputObject $lexeme
            $response = Invoke-RestMethod -Uri "$global:productionHost/lexeme" -Method PUT -Headers $global:authorizationToken -Body ([System.Text.Encoding]::UTF8.GetBytes($lexemeJson))
        }
    }
}
#EndRegion '.\Public\Bulk-Normalize.ps1' 91
#Region '.\Public\Bulk-SameInflectionAs.ps1' 0
## =============================================================================
##
## This script's purpose is to set same inflection as. MUST BE RUN ON THE LAMP SERVER
##
## =============================================================================
<#
.SYNOPSIS
Sets SameInflectionsAs and Stem on multi-word lexemes, using the lexeme returned by GetVerbSameInflectionAs.

.DESCRIPTION
Selects multi-word lexemes (MainLemma contains a space) of the current language that have no
InflectingSegment, no SameInflectionsAs and were not touched by a 'bulk%' batch. For each one
GetVerbSameInflectionAs returns a lexeme ID and a stem; when the ID is positive the row is updated and the
features (ConnectionType = 1) of the updated lexeme are deleted.

.PARAMETER language
Language code. NOTE(review): not referenced in the body; $languageID is used instead - confirm.

.PARAMETER filter
Raw T-SQL condition appended to the WHERE clause as "AND (<filter>)". Executed as-is; trusted input only.
#>
function Bulk-SameInflectionAs{
[CmdletBinding()]
Param(
    [Parameter(Mandatory = $true, HelpMessage="Language code: ")][String] $language,
    [Parameter(Mandatory = $true, HelpMessage="Filter: ")][String] $filter
)
    # . ".\normalizationLib.ps1"
    # NOTE(review): $path is not a parameter of this function; it is resolved from an outer scope - confirm.
    $config_path = $path + "Tisane.TestConsole.exe.config"
    [System.AppDomain]::CurrentDomain.SetData("APP_CONFIG_FILE", $config_path)
    Add-Type -AssemblyName System.Configuration
    # Reset the cached ConfigurationManager state through reflection so the APP_CONFIG_FILE set above is re-read.
    [Configuration.ConfigurationManager].GetField("s_initState", "NonPublic, Static").SetValue($null, 0)
    [Configuration.ConfigurationManager].GetField("s_configSystem", "NonPublic, Static").SetValue($null, $null)
    ([Configuration.ConfigurationManager].Assembly.GetTypes() | where {$_.FullName -eq "System.Configuration.ClientConfigPaths"}).GetField("s_current", "NonPublic, Static").SetValue($null, $null)
    [Configuration.ConfigurationManager]::ConnectionStrings[0].Name
    [Reflection.Assembly]::LoadFrom($path + "Tisane.Runtime.dll")
    # $OutputEncoding = [console]::InputEncoding = [console]::OutputEncoding = New-Object System.Text.UTF8Encoding
    Login-Lamp
    # $whatever = Invoke-WebRequest -Uri "$productionHost/setLanguage?language=$languageID" -Method POST -Headers $authorizationToken -Body ' '
    # NOTE(review): $languageId is never assigned in this function; presumably provided by an outer scope - confirm.
    Set-LampLanguage -languageId $languageId
    $mweSQL = "AND MainLemma LIKE '% %'"
    $sql = "USE tisane; SELECT Id, MainLemma FROM Lexemes l WHERE LanguageId = $languageID AND (ISNULL(l.LastUpdatedByBatch,'') NOT LIKE 'bulk%') AND InflectingSegment IS NULL AND SameInflectionsAs IS NULL $mweSQL AND ($filter)"
    # @() keeps an empty result from being piped as a single $null item.
    $lexemes = @(Invoke-Sqlcmd -Query $sql)
    $lexemes | ForEach-Object {
        $mainLemma = $_.MainLemma
        $targetLexemeId = $_.Id
        # $lexemeId is the lexeme whose inflections are reused (the model); $targetLexemeId is the row being changed.
        $lexemeId, $stem = GetVerbSameInflectionAs -languageId $languageID -entry $mainLemma
        if ($lexemeId -gt 0) {
            # Double the apostrophes so a stem containing ' does not break the statement.
            $stemSql = "$stem" -replace "'", "''"
            $updateSQL = "SameInflectionsAs = $lexemeId, Stem = N'$stemSql'"
            Write-Host "$mainLemma UPDATE: $updateSQL" -ForegroundColor Green
            # The DELETE clears the features of the lexeme that was just updated. It previously used
            # $lexemeId, which wiped the features of the model lexeme instead.
            $sql = "USE tisane; UPDATE Lexemes SET $updateSQL, LastUpdatedByBatch = 'bulkSameInflectionAs.ps1', LastBatchUpdate = GETUTCDATE() WHERE Id = $targetLexemeId; DELETE dbo.Features WHERE ConnectionType = 1 AND EntityId = $targetLexemeId"
            #$sql
            $res = Invoke-Sqlcmd -Query $sql
        }
    }
}
#EndRegion '.\Public\Bulk-SameInflectionAs.ps1' 49
'.\Public\Generate-MWEVerbalPhrases.ps1' 0 function Generate-MWEVerbalPhrases{ [CmdletBinding()] Param( [Parameter(Mandatory = $true, HelpMessage="Filename")][String] $pathname, [Parameter(Mandatory = $true, HelpMessage="Language ID")][String] $languageId, [Parameter(Mandatory = $true, HelpMessage="0=verb is first; 1=verb is last")][Boolean] $swap ) #Login-Lamp $fileLines = Get-Content $pathname $i = 0 $fileLines | ForEach-Object{ $phrase = $_ $phrase=$phrase.Split(" ") # extracting verb and rest depending on swap parameter if ($swap) { $verb = $phrase[-1] $rest = $phrase[0..($phrase.count-2)] -join ', ' } else { $verb,$rest= $phrase $rest=$rest -join ' ' } $pct = $i / $fileLines.length * 100 Write-Progress -Activity "Generating verbal phrases from MWEs" -Status "$pct% $phrase" -PercentComplete $pct $results = Invoke-Sqlcmd -Query "USE tisane; EXEC dbo.GenerateMWEVerbalPhrases $languageId, N'$verb', N'$rest', $swap" $i += 1 } } #EndRegion '.\Public\Generate-MWEVerbalPhrases.ps1' 35 #Region '.\Public\Import-FamiliesByWikidataCategory.ps1' 0 ## ============================================================================= ## ## This script's purpose is to import new entries from Wikidata for all the supported languages ## ## ============================================================================= function Import-FamiliesByWikidataCategory{ [CmdletBinding()] Param( # [Parameter(Mandatory = $true, valueFromPipeline=$true, HelpMessage="LaMP user: ")][String] $user, # [Parameter(Mandatory = $true, HelpMessage="LaMP password: ")][String] $password, [Parameter(Mandatory = $false, HelpMessage="Language: ")][String] $language ) # SELECT fa.Definition, fa.Id, other.Id OtherId, other.Definition otherDefinition FROM dbo.Families fa INNER JOIN dbo.Families other ON other.WikidataId = fa.WikidataId AND other.Id > fa.Id WHERE fa.WikidataId IS NOT NULL ## duplicates ## 60000 was where it ended $global:maxEntries = 0 $ttr = New-Object 
System.Diagnostics.TextWriterTraceListener("C:\Tisane\wikidata.log") [System.Diagnostics.Trace]::Listeners.Add($ttr) [System.Diagnostics.Trace]::AutoFlush = $true ## overcome the HTTPS error: https://stackoverflow.com/questions/11696944/powershell-v3-invoke-webrequest-https-error # If (-not ("TrustAllCertsPolicy" -as [type])) { # Add-Type @" # using System.Net; # using System.Security.Cryptography.X509Certificates; # public class TrustAllCertsPolicy : ICertificatePolicy { # public bool CheckValidationResult( # ServicePoint srvPoint, X509Certificate certificate, # WebRequest request, int certificateProblem) { # return true; # } # } # "@ # } # [System.Net.ServicePointManager]::CertificatePolicy = New-Object TrustAllCertsPolicy Login-Lamp class LexiconEntry { [string]$word [int]$familyId [string]$wikidata LexiconEntry([string]$word, [int]$familyId, [string]$wikidata) { $this.word = $word $this.familyId = $familyId $this.wikidata = $wikidata } } class ImportedLanguage { [string]$ietf [int]$tisaneId $entries AddWord([string]$word, [int]$familyId, [string]$wikidata, [boolean]$proper) { if (-not $proper -and -not ($this.ietf -eq 'de')) { if (-not $word.ToLower().Equals($word) -and $word.Substring(1).Equals($word.Substring(1).ToLower())) { # uncapitalize $word = $word.ToLower() } } $toRemovePos = $word.IndexOf('(') if ($toRemovePos -gt 0) { $word = $word.Substring(0, $toRemovePos).Trim() } #Write-Host "Adding $word (family $familyId)" $this.entries += @([LexiconEntry]::new($word, $familyId, $wikidata)) if ($this.entries.length -gt $global:maxEntries) { $global:maxEntries = $this.entries.length Write-Host "Adding $word (family $familyId)" } if ($this.entries.length -gt 300) { $this.Flush() $global:maxEntries = 0 } } Flush() { $languageId = $this.tisaneId Write-Progress -Activity "Saving language $languageId" try { $whatever = Invoke-WebRequest -Uri "$global:productionHost/setLanguage?language=$languageId" -Method POST -Headers $global:authorizationToken -Body ' ' 
-UseBasicParsing } catch { Write-Host "Error setting language: $_" -ForegroundColor Red break } $this.entries | ForEach-Object { #Write-Host "Saving language " + $this.tisaneId + ", family " + $_.familyId + ": " + $_.word $id = $_.familyId $wrd = $_.word $wdid = $_.wikidata try { Write-Progress -Activity "Saving language $languageId $wrd [$id]" $ack = Invoke-RestMethod -Uri "$global:productionHost/importFamilies?lexeme=$wrd&families=$id&behavior=f1&source=wikidata&orgId=$wdid" -Method POST -Headers $global:authorizationToken } catch { Write-Host "Error saving: $_" -ForegroundColor Red } } $this.entries = @() Invoke-Sqlcmd -Query "USE tisane; DELETE edits WHERE username = 'bulkimport'" # prevent log blowout Invoke-Sqlcmd -Query "USE tisane; update lexemes SET caninflect = 1 WHERE created > GETUTCDATE() - 0.45 AND caninflect = 0 AND sourcetype = 'wikidata'" # SELECT bywiki.Definition, l.* FROM dbo.Lexemes l INNER JOIN dbo.LexemeFamilies lf ON lf.LexemeId = l.Id INNER JOIN dbo.Families fa ON fa.Id = lf.FamilyId AND (fa.WikidataId <> l.SourceId OR fa.WikidataId IS NULL) INNER JOIN dbo.Families bywiki ON bywiki.WikidataId = l.SourceId WHERE l.Created > GETUTCDATE() - 1 AND l.SourceType = 'wikidata' } ImportedLanguage([string]$ietf, [int]$tisaneId) { $this.ietf = $ietf $this.tisaneId = $tisaneId $this.entries = @() } } $global:productionHost = 'https://lampws.tisane.ai:443' # #$pPass = ConvertFrom-SecureString $password # $authenticationBody = '["' + $user + '", "' + $password + '"]' $languages = Invoke-RestMethod -Uri "$global:productionHost/languages" -Method GET $toImport = @() $languages | Foreach-Object { if ($language) { if ($_.ISOCode -eq $language) { $toImport += @([ImportedLanguage]::new($_.ISOCode, $_.id)) } } else { if ($_.ISOCode -and $_.ISOCode -ne 'en') { if ($_.ISOCode -eq 'ps-AF') { $toImport += @([ImportedLanguage]::new('ps', $_.id)) } if ($_.ISOCode -eq 'zh-CN') { $toImport += @([ImportedLanguage]::new('zh', $_.id)) } if ($_.ISOCode -eq 'zh-TW') { 
$toImport += @([ImportedLanguage]::new('zh-tw', $_.id)) } $toImport += @([ImportedLanguage]::new($_.ISOCode, $_.id)) } } } # [Net.ServicePointManager]::SecurityProtocol = "tls12, tls11, tls" # [Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12 -bor [Net.SecurityProtocolType]::Tls11 -bor [Net.SecurityProtocolType]::Tls # $productionAuthentication = Invoke-WebRequest -Uri "$global:productionHost/authenticate" -Method POST -Body $authenticationBody # $inJson = ConvertFrom-Json -InputObject $productionAuthentication.Content # $global:authorizationToken = @{} # $global:authorizationToken.Add('Authorization', $inJson.token) # $OutputEncoding = [console]::InputEncoding = [console]::OutputEncoding = New-Object System.Text.UTF8Encoding # remove wrong ones: DELETE l from lexemes l INNER JOIN lexemefamilies lf ON lf.lexemeid = l.id INNER JOIN families fa ON fa.id = lf.familyid AND NOT (fa.WikidataId = l.SourceId) WHERE l.created > GETUTCDATE() - 1.5 AND l.SourceType = 'wikidata' if ($language) { $langId = $toImport[0].tisaneId $families = Invoke-Sqlcmd -Query "USE tisane; SELECT Id, WikidataId, Description, IsProperNoun FROM Families f WHERE WikidataId IS NOT NULL AND NOT EXISTS (SELECT 1 FROM dbo.Lexemes l INNER JOIN dbo.LexemeFamilies lf ON lf.LexemeId = l.Id AND lf.FamilyId = f.Id WHERE l.LanguageId = $langId)" } else { $families = Invoke-Sqlcmd -Query "USE tisane; SELECT Id, WikidataId, Description, IsProperNoun FROM Families f WHERE WikidataId IS NOT NULL AND NOT EXISTS (SELECT 1 FROM dbo.Lexemes l WHERE l.SourceType = 'wikidata' AND l.SourceId = f.WikidataId)" } $i = 0 $updated = 0 $families | Foreach-Object { #$_ $familyId = $_.Id $wikidataId = $_.WikidataId $description = $_.Description $proper = $_.IsProperNoun $pct = $i / $families.length * 100 $i += 1 Write-Progress -Activity "[$familyId] $description max entries: $global:maxEntries" -Status "$pct% complete" -PercentComplete $pct #'Sorted: ' + $synonyms #'Id=' + $id + ', description=' + 
$description $fnd = @() $attempt = 0 $wikidataUrl = "https://www.wikidata.org/w/api.php?action=wbgetentities&ids=$wikidataId&format=json" $wikidataResponse = Invoke-WebRequest -Uri $wikidataUrl -Method GET -UseBasicParsing $wikidataResponse = $wikidataResponse -replace $wikidataId, 'wikiId' $wd = ConvertFrom-Json -InputObject $wikidataResponse $labels = $wd.entities.wikiId.labels $labels | Get-Member -Type Properties | Foreach-Object { $crnt = $_ # [pscustomobject]$_ $nm = $_.name $toImport | ForEach-Object { if ($nm -eq $_.ietf) { #"Adding " + $labels.$nm.value + " with " + $familyId $_.AddWord($labels.$nm.value, $familyId, $wikidataId, $proper) } } #$_.name + ' = ' + $labels.$nm.value } } $toImport | ForEach-Object { $_.Flush() } } #EndRegion '.\Public\Import-FamiliesByWikidataCategory.ps1' 213 #Region '.\Public\Import-FamiliesFromWikidataByCategory.ps1' 0 ## ============================================================================= ## ## This script's purpose is to import new families from Wikidata by hypernym family ## ## ============================================================================= function Import-FamiliesFromWikidataByCategory{ [CmdletBinding()] Param( # [Parameter(Mandatory = $true, valueFromPipeline=$true, HelpMessage="LaMP user: ")][String] $user, # [Parameter(Mandatory = $true, HelpMessage="LaMP password: ")][String] $password, [Parameter(Mandatory = $false, HelpMessage="Wikidata hypernym: ")][int] $family ) # SELECT fa.Definition, fa.Id, other.Id OtherId, other.Definition otherDefinition FROM dbo.Families fa INNER JOIN dbo.Families other ON other.WikidataId = fa.WikidataId AND other.Id > fa.Id WHERE fa.WikidataId IS NOT NULL ## duplicates ## 60000 was where it ended $global:maxEntries = 0 $ttr = New-Object System.Diagnostics.TextWriterTraceListener("C:\Tisane\wikidata.log") [System.Diagnostics.Trace]::Listeners.Add($ttr) [System.Diagnostics.Trace]::AutoFlush = $true ## overcome the HTTPS error: 
https://stackoverflow.com/questions/11696944/powershell-v3-invoke-webrequest-https-error # If (-not ("TrustAllCertsPolicy" -as [type])) { # Add-Type @" # using System.Net; # using System.Security.Cryptography.X509Certificates; # public class TrustAllCertsPolicy : ICertificatePolicy { # public bool CheckValidationResult( # ServicePoint srvPoint, X509Certificate certificate, # WebRequest request, int certificateProblem) { # return true; # } # } # "@ # } # [System.Net.ServicePointManager]::CertificatePolicy = New-Object TrustAllCertsPolicy $global:productionHost = 'https://lampws.tisane.ai:443' # #$pPass = ConvertFrom-SecureString $password # $authenticationBody = '["' + $user + '", "' + $password + '"]' Login-Lamp $languages = Invoke-RestMethod -Uri "$global:productionHost/languages" -Method GET $toImport = @() $languages | Foreach-Object { if ($language) { if ($_.ISOCode -eq $language) { $toImport += @([ImportedLanguage]::new($_.ISOCode, $_.id)) } } else { if ($_.ISOCode -and $_.ISOCode -ne 'en') { if ($_.ISOCode -eq 'ps-AF') { $toImport += @([ImportedLanguage]::new('ps', $_.id)) } if ($_.ISOCode -eq 'zh-CN') { $toImport += @([ImportedLanguage]::new('zh', $_.id)) } if ($_.ISOCode -eq 'zh-TW') { $toImport += @([ImportedLanguage]::new('zh-tw', $_.id)) } $toImport += @([ImportedLanguage]::new($_.ISOCode, $_.id)) } } } # [Net.ServicePointManager]::SecurityProtocol = "tls12, tls11, tls" # [Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12 -bor [Net.SecurityProtocolType]::Tls11 -bor [Net.SecurityProtocolType]::Tls # $productionAuthentication = Invoke-WebRequest -Uri "$global:productionHost/authenticate" -Method POST -Body $authenticationBody # $inJson = ConvertFrom-Json -InputObject $productionAuthentication.Content # $global:authorizationToken = @{} # $global:authorizationToken.Add('Authorization', $inJson.token) # $OutputEncoding = [console]::InputEncoding = [console]::OutputEncoding = New-Object System.Text.UTF8Encoding $families = 
Invoke-Sqlcmd -Query "USE tisane; SELECT Id, WikidataId, Description FROM Families f WHERE f.Id = $family AND f.WikidataId IS NOT NULL" if (-not($families) -or $families.length -ne 1) { Write-Host "No suitable families found for ID: $family" -ForegroundColor Red break } $i = 0 $updated = 0 $families | ForEach-Object { $familyId = $_.Id $hypernymWikidataId = $_.WikidataId $description = $_.Description $wikidataUrl = "https://www.wikidata.org/w/api.php?action=query&prop=linkshere&format=json&lhprop=title&lhnamespace=0&lhlimit=500&titles=$hypernymWikidataId" $wikidataResponse = Invoke-WebRequest -Uri $wikidataUrl -Method GET -UseBasicParsing # must replace "pages":{"187588" -> "pages":{"this_page" $wikidataResponse.query.pages.this_page.linkshere | ForEach-Object { # 1. Retrieve the page. 2. See if already exists by Wikidata ID and Lexemes in LanguageId 7. 3. Save if new } $pct = $i / $families.length * 100 $i += 1 Write-Progress -Activity "[$familyId] $description max entries: $global:maxEntries" -Status "$pct% complete" -PercentComplete $pct #'Sorted: ' + $synonyms #'Id=' + $id + ', description=' + $description $fnd = @() $attempt = 0 $wikidataResponse = $wikidataResponse -replace $wikidataId, 'wikiId' $wd = ConvertFrom-Json -InputObject $wikidataResponse $labels = $wd.entities.wikiId.labels $descriptions = $wd.entities.wikiId.descriptions if ($descriptions -and $descriptions.en.value -eq 'Wikimedia disambiguation page') { Invoke-Sqlcmd -Query "USE tisane; UPDATE dbo.Families SET WikidataId = NULL, LastUpdatedByBatch = 'badwikidata_$wikidataId', LastBatchUpdate = GETUTCDATE() WHERE Id = $familyId" Write-Host "Removing Wikidata disambiguation page: $wikidataId ($familyId)" -ForegroundColor Yellow } else { $englishLabel = $labels.en.value -replace "'", "''" $lexemeIds = Invoke-Sqlcmd -Query "USE tisane; SELECT l.Id FROM dbo.Lexemes l INNER JOIN dbo.LexemeFamilies lf ON lf.LexemeId = l.Id AND lf.FamilyId = $familyId WHERE l.LanguageId = 7 AND l.MainLemma = 
N'$englishLabel' COLLATE SQL_Latin1_General_CP1_CI_AS" if ($familyId -gt 100000 -or $lexemeIds) { $labels | Get-Member -Type Properties | Foreach-Object { $crnt = $_ # [pscustomobject]$_ $nm = $_.name $toImport | ForEach-Object { if ($nm -eq $_.ietf) { #"Adding " + $labels.$nm.value + " with " + $familyId $_.AddWord($labels.$nm.value, $familyId, $wikidataId, $proper) } } #$_.name + ' = ' + $labels.$nm.value } } else { Invoke-Sqlcmd -Query "USE tisane; UPDATE dbo.Families SET WikidataId = NULL, LastUpdatedByBatch = 'badwikidata_$wikidataId', LastBatchUpdate = GETUTCDATE() WHERE Id = $familyId" Write-Host "Incorrect Wikidata: $wikidataId ($familyId)" -ForegroundColor Yellow } } } $toImport | ForEach-Object { $_.Flush() } } #EndRegion '.\Public\Import-FamiliesFromWikidataByCategory.ps1' 158 #Region '.\Public\Import-FeatureFromWiktionary.ps1' 0 ## ============================================================================= ## ## This script's purpose is to set features from Wiktionary. MUST BE RUN ON THE LAMP SERVER ## ## ============================================================================= function Import-FeatureFromWiktionary{ [CmdletBinding()] Param( # [Parameter(Mandatory = $true, valueFromPipeline=$true, HelpMessage="LaMP user: ")][String] $user, # [Parameter(Mandatory = $true, HelpMessage="LaMP password: ")][String] $password, [Parameter(Mandatory = $true, HelpMessage="Language code: ")][String] $language, [Parameter(Mandatory = $true, HelpMessage="Path: ")][String] $path, [Parameter(Mandatory = $true, HelpMessage="Part of speech: ")][String] $pos, [Parameter(Mandatory = $true, HelpMessage="Feature List ID: ")][int] $listId, [Parameter(Mandatory = $true, HelpMessage="Wiktionary labels: ")][String[]] $labels, [Parameter(Mandatory = $true, HelpMessage="Feature values: ")][String[]] $values, # an array of Tisane feature values [Parameter(Mandatory = $true, HelpMessage="Wiktionary category: ")][String] $category, [Parameter(Mandatory = $false, 
HelpMessage="Retag: ")][char] $retag, [Parameter(Mandatory = $false, HelpMessage="Allow touched: ")][boolean] $touched, [Parameter(Mandatory = $false, HelpMessage="Database: ")][String] $tisaneDb ) class Feature { [string]$index [string]$value [string]$type Feature([string]$index, [string]$value, [string]$type) { $this.index = $index $this.value = $value $this.type = $type } } if (-not($tisaneDb)) { $tisaneDb = 'tisane' } $config_path = $path + "Tisane.TestConsole.exe.config" [System.AppDomain]::CurrentDomain.SetData("APP_CONFIG_FILE", $config_path) Add-Type -AssemblyName System.Configuration [Configuration.ConfigurationManager].GetField("s_initState", "NonPublic, Static").SetValue($null, 0) [Configuration.ConfigurationManager].GetField("s_configSystem", "NonPublic, Static").SetValue($null, $null) ([Configuration.ConfigurationManager].Assembly.GetTypes() | where {$_.FullName -eq "System.Configuration.ClientConfigPaths"}).GetField("s_current", "NonPublic, Static").SetValue($null, $null) [Configuration.ConfigurationManager]::ConnectionStrings[0].Name [Reflection.Assembly]::LoadFrom($path + "Tisane.Runtime.dll") # $authenticationBody = '["' + $user + '", "' + $password + '"]' # [Net.ServicePointManager]::SecurityProtocol = "tls12, tls11, tls" # [Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12 -bor [Net.SecurityProtocolType]::Tls11 -bor [Net.SecurityProtocolType]::Tls # $global:productionHost = 'https://lampws.tisane.ai:443' # $productionAuthentication = Invoke-WebRequest -Uri "$global:productionHost/authenticate" -Method POST -Body $authenticationBody -UseBasicParsing # $inJson = ConvertFrom-Json -InputObject $productionAuthentication.Content # $global:authorizationToken = @{} # $global:authorizationToken.Add('Authorization', $inJson.token) Login-Lamp $languageJSON = Invoke-RestMethod -Uri "$global:productionHost/languages" -Method GET -UseBasicParsing $languageNamesToCodes = @{} $languageID = 0 $languageJSON | foreach { if ($_.ISOCode 
-eq $language) { $languageID = $_.id $languageNamesToCodes.Add($_.englishName, $_.ISOCode) } } # $OutputEncoding = [console]::InputEncoding = [console]::OutputEncoding = New-Object System.Text.UTF8Encoding $featureIndex = $listId $featureIndexQueryResult = Invoke-Sqlcmd -Query "USE $tisaneDb; SELECT FeatureIndex FROM FeatureDefinitionLists fdl WHERE fdl.Id = $listId" if ($featureIndexQueryResult -and $featureIndexQueryResult[0]) { $featureIndex = $featureIndexQueryResult[0] } $whatever = Invoke-WebRequest -Uri "$global:lampHost/setLanguage?language=$languageID" -Method POST -Headers $global:authorizationToken -Body ' ' -UseBasicParsing $tisanePOSValue = $pos.ToUpper() switch ($tisanePOSValue) { 'ADJECTIVE' { $tisanePOSValue = 'ADJ' } 'ADVERB' { $tisanePOSValue = 'ADV' } 'PREPOSITION' { $tisanePOSValue = 'PREP' } 'CONJUNCTION' { $tisanePOSValue = 'CJ' } 'INTERJECTION' { $tisanePOSValue = 'INTJ' } } if ($retag) { $retag = [char]::ToLower($retag) } $tagsFound = @() $updated = 0 $articleCount = 0 $existCount = 0 do { $wikidataUrl = "https://en.wiktionary.org/w/api.php?action=query&generator=categorymembers&format=json&gcmtitle=Category:$category&prop=pageprops&gcmlimit=500&gcmcontinue=$bookmark" "Loading entries from $wikidataUrl" $wikidataResponse = Invoke-WebRequest -Uri $wikidataUrl -Method GET -UseBasicParsing $wikidataResponse = $wikidataResponse.Content # -replace '(?<=pages["]:{["])[^"]+', 'results' $listOfInstances = ConvertFrom-Json -InputObject $wikidataResponse if ($listOfInstances.continue) { $bookmark = $listOfInstances.continue.gcmcontinue } else { $bookmark = $null } #"Bookmark: $bookmark" $listOfInstances.query.pages.PSObject.Properties | foreach { if ($_.Value.title) { $articleCount += 1 $originalWord = $_.Value.title $pageId = $_.Value.pageid $actualWord = $originalWord -replace ' ', '_' $wiktionaryParser = New-Object Tisane.Helper.EnglishWiktionaryParser -ArgumentList ($actualWord, $languageNamesToCodes) $articleJSON = 
($wiktionaryParser.ToJson().Value | Where Key -eq $pos).Value if (-not $articleJSON) { # $articleJSON = $wiktionaryParser.ToJson().Value $indexInLabelArray = -1 } else { $article = ConvertFrom-Json -InputObject $articleJSON.ToString() $tag = $article.tag $indexInLabelArray = [array]::IndexOf($labels, $tag) if ($indexInLabelArray -eq -1 -and $article.interpretations) { $article.interpretations | ForEach-Object { if ($indexInLabelArray -eq -1 -and $_.case) { $tag = $_.case[0] $indexInLabelArray = [array]::IndexOf($labels, $tag) } } } if ([array]::IndexOf($tagsFound, $tag) -lt 0) { $tagsFound += $tag } } if ($indexInLabelArray -gt -1) { $translatedTag = $values[$indexInLabelArray] $lexemeExistsWithoutCorrectFeatureSQL = "USE $tisaneDb; SELECT l.Id, (SELECT TOP 1 f.FeatureValue FROM dbo.Features f WHERE f.ConnectionType = 1 AND f.EntityID = l.ID AND f.FeatureListID = $listId) CurrentFeatureValue, l.LastUpdatedBy FROM dbo.Lexemes l WHERE l.LanguageID = $languageID AND l.MainLemma = N'$originalWord' AND EXISTS (SELECT TOP 1 1 FROM dbo.LexemeFamilies lf WHERE lf.LexemeID = l.ID AND EXISTS (SELECT TOP 1 1 FROM dbo.Features ff WHERE ff.ConnectionType = 2 AND ff.EntityID = lf.FamilyID AND ff.FeatureListID = 1 AND ff.FeatureValue = '$tisanePOSValue') )" #$lexemeExistsWithoutCorrectFeatureSQL $matchingLexemes = Invoke-Sqlcmd -Query $lexemeExistsWithoutCorrectFeatureSQL if ($matchingLexemes) { $existCount += 1 $matchingLexemes | ForEach-Object { $lexemeId = $_[0] $existingValue = $_[1] if ($existingValue -is [DBNull]) { $existingValue = $null } $lastUpdatedBy = $_[2] if ($lastUpdatedBy -is [DBNull] -or $lastUpdatedBy -eq 'bulkimport') { $lastUpdatedBy = $null } if ($lastUpdatedBy -and -not ($touched)) { Write-Host "Lexeme $originalWord (id $lexemeId) has been touched by a linguist and will not be updated" -ForegroundColor Red } else { # there is a lexeme we can and are allowed to update if ($retag -eq 'c') { # delete all features, it will be retagged $clearSQL = "USE $tisaneDb; 
DELETE dbo.Features WHERE ConnectionType = 1 AND EntityId = $lexemeId" Invoke-Sqlcmd -Query $clearSQL #$clearSQL $existingValue = $null } $updated += 1 $lexemeUpdateSQL = "USE $tisaneDb; UPDATE l SET l.LastBatchUpdate = GETUTCDATE(), l.LastUpdatedByBatch = 'importFeatureFromWiktionary_$category_$pageId' FROM dbo.Lexemes l WHERE l.Id = $lexemeId" if (-not $existingValue -or $existingValue -eq $null -or $existingValue -eq '') { # there's no existing value Write-Host "Updating $originalWord to $listID = $translatedTag ($tag); not set previously" -ForegroundColor Green #$updateSQL $insertSQL = "USE $tisaneDb; INSERT INTO dbo.Features (ConnectionType, EntityID, FeatureListID, FeatureValue) SELECT 1, $lexemeId, $listId, '$translatedTag'" #$insertSQL Invoke-Sqlcmd -Query $insertSQL Invoke-Sqlcmd -Query $lexemeUpdateSQL #$lexemeUpdateSQL } else { if ($existingValue -ne $translatedTag) { Write-Host "Updating $originalWord to $listID = $translatedTag ($tag). Existing value: $existingValue" -ForegroundColor Green $updateSQL = "USE $tisaneDb; UPDATE f SET f.FeatureValue = '$translatedTag' FROM dbo.Lexemes l INNER JOIN dbo.LexemeFamilies lf ON lf.LexemeID = l.ID AND EXISTS (SELECT TOP 1 1 FROM dbo.Features ff WHERE ff.ConnectionType = 2 AND ff.EntityID = lf.FamilyID AND ff.FeatureListID = 1 AND ff.FeatureValue = '$tisanePOSValue') INNER JOIN dbo.Features f ON f.ConnectionType = 1 AND f.EntityID = l.ID AND f.FeatureListID = $listId WHERE l.Id = $lexemeId" #$updateSQL Invoke-Sqlcmd -Query $updateSQL Invoke-Sqlcmd -Query $lexemeUpdateSQL #$lexemeUpdateSQL } } if ($retag) { #"Updating: " + $lexemeJson $response = Invoke-RestMethod -Uri "$global:productionHost/lexeme?id=$lexemeId" -Method GET -Headers $global:authorizationToken $grammar = @() $lexeme = @{ id=$lexemeId lemma=$originalWord stem=$originalWord grammar=$grammar } $grammar = $response.grammar # existing grammar; cleared before if needed #$grammar += @([Feature]::new($featureIndex, $translatedTag, "Grammar")) 
$lexeme.grammar = $grammar $lexemeJson = ConvertTo-Json -InputObject $lexeme #"Tagging: " + $lexemeJson $taggedLemma = Invoke-RestMethod -Uri "$global:productionHost/tagLemma" -Method POST -Headers $global:authorizationToken -Body ([System.Text.Encoding]::UTF8.GetBytes($lexemeJson)) #$taggedLemma $lexeme.grammar = $taggedLemma.grammar $lexeme.stem = $taggedLemma.stem if ($taggedLemma.style) { $lexeme.style = $taggedLemma.style } $lexeme.requestId = $response.requestId # need for the update request $lexemeJson = ConvertTo-Json -InputObject $lexeme $response = Invoke-RestMethod -Uri "$global:productionHost/lexeme" -Method PUT -Headers $global:authorizationToken -Body ([System.Text.Encoding]::UTF8.GetBytes($lexemeJson)) #$response } } } } else { Write-Host "Lexeme $originalWord does not exist" -ForegroundColor Red } } else { Write-Host "No valid mapped tag for $originalWord (original tag: $tag, lexeme $lexemeId)" -ForegroundColor Red } } # otherwise, it's likely less important } } while ($bookmark) "Tags encountered:" $tagsFound "Updated: $updated, number of articles: $articleCount, $existCount lexeme(s) exist" } #EndRegion '.\Public\Import-FeatureFromWiktionary.ps1' 242
#Region '.\Public\Import-FeatureFromWiktionaryDeclension.ps1' 0
## =============================================================================
##
## This script's purpose is to set features based on declension tables in the Wiktionary. MUST BE RUN ON THE LAMP SERVER
##
## It is done by comparison of a set of regexes applied on the lemma to a form identified by the specified label.
## If there's a match, then the feature value is assigned.
##
## EXAMPLE: PS C:\Tisane> Import-FeatureFromWiktionaryDeclension -path C:\Tisane\Test
## Console\ -language tr -pos Noun -category Turkish_nouns -label dative -listId 35 -replaceRegexes "(?<=[^ıiuüaoue])[ıiuü]
## (?=[^ıiuüaoue]$)","p$","ç$","t$","k$","k$" -replaceWith "","b","c","d","g","ğ" -values "10","01","02","03","04","05"
##
## Parameters:
##   language        language code of the lexemes to update
##   path            folder holding Tisane.TestConsole.exe.config and Tisane.Runtime.dll (trailing backslash optional)
##   pos             part of speech as named in the Wiktionary article (Noun, Adjective, ...)
##   category        Wiktionary category whose members are scanned
##   label           grammar label identifying the inflected form in the declension table
##   listId          feature list to set
##   replaceRegexes  regexes tried, in order, against the lemma
##   replaceWith     replacement for the regex at the same position
##   values          feature value assigned when the regex at the same position is the first to match the form
##   retag           optional; any value re-tags the lexeme through the REST API, 'c' also clears its features first
##   touched         optional; also update lexemes whose LastUpdatedBy is set to something other than 'bulkimport'
##   dbase           optional; database name, 'tisane' when omitted
## =============================================================================
function Import-FeatureFromWiktionaryDeclension{
    [CmdletBinding()]
    Param(
        # $user / $password are no longer parameters; Login-Lamp is called instead of the former inline authentication.
        [Parameter(Mandatory = $true, HelpMessage="Language code: ")][String] $language,
        [Parameter(Mandatory = $true, HelpMessage="Path: ")][String] $path,
        [Parameter(Mandatory = $true, HelpMessage="Part of speech: ")][String] $pos,
        [Parameter(Mandatory = $true, HelpMessage="Wiktionary category: ")][String] $category,
        [Parameter(Mandatory = $true, HelpMessage="Wiktionary grammar feature: ")][String] $label, # the grammar label to look for in the declension table
        [Parameter(Mandatory = $true, HelpMessage="Target Feature List ID: ")][int] $listId, # the feature to set
        [Parameter(Mandatory = $true, HelpMessage="Regexes to replace: ")][String[]] $replaceRegexes,
        [Parameter(Mandatory = $true, HelpMessage="Replace with values: ")][AllowEmptyString()][String[]] $replaceWith,
        [Parameter(Mandatory = $true, HelpMessage="Feature values: ")][String[]] $values, # an array of Tisane feature values
        [Parameter(Mandatory = $false, HelpMessage="Retag: ")][char] $retag,
        [Parameter(Mandatory = $false, HelpMessage="Allow touched: ")][boolean] $touched,
        [Parameter(Mandatory = $false, HelpMessage="Database: ")][String] $dbase
    )

    # Only referenced by a commented-out statement in the retag section; kept so that statement can be re-enabled.
    class Feature {
        [string]$index
        [string]$value
        [string]$type
        Feature([string]$index, [string]$value, [string]$type) {
            $this.index = $index
            $this.value = $value
            $this.type = $type
        }
    }

    # The three arrays are read by the same index; a shorter replaceWith/values would silently yield
    # an empty replacement or an empty feature value.
    if ($replaceWith.Length -lt $replaceRegexes.Length -or $values.Length -lt $replaceRegexes.Length) {
        throw "replaceWith and values must supply one element for every element of replaceRegexes"
    }

    if (-not($dbase)) {
        $dbase = 'tisane'
    }

    # Redirect the .NET configuration system to the test console's config file by resetting its cached
    # state through reflection, then load the Tisane runtime from the same folder.
    # Path.Combine makes the trailing backslash on -path optional.
    $config_path = [System.IO.Path]::Combine($path, "Tisane.TestConsole.exe.config")
    [System.AppDomain]::CurrentDomain.SetData("APP_CONFIG_FILE", $config_path)
    Add-Type -AssemblyName System.Configuration
    [Configuration.ConfigurationManager].GetField("s_initState", "NonPublic, Static").SetValue($null, 0)
    [Configuration.ConfigurationManager].GetField("s_configSystem", "NonPublic, Static").SetValue($null, $null)
    ([Configuration.ConfigurationManager].Assembly.GetTypes() | where {$_.FullName -eq "System.Configuration.ClientConfigPaths"}).GetField("s_current", "NonPublic, Static").SetValue($null, $null)
    [Configuration.ConfigurationManager]::ConnectionStrings[0].Name   # emitted to the output stream
    [Reflection.Assembly]::LoadFrom([System.IO.Path]::Combine($path, "Tisane.Runtime.dll"))

    Login-Lamp

    # NOTE(review): $languageID and $languageNamesToCodes are not assigned anywhere in this function
    # (the code that filled them is gone together with the inline authentication). Presumably Login-Lamp
    # or the module scope provides them - confirm. The same applies to $global:lampHost below, while the
    # retag requests use $global:productionHost.

    $featureIndex = $listId   # currently only used by the commented-out [Feature] statement
    $featureIndexQueryResult = Invoke-Sqlcmd -Query "USE $dbase; SELECT FeatureIndex FROM FeatureDefinitionLists fdl WHERE fdl.Id = $listId"
    if ($featureIndexQueryResult -and $featureIndexQueryResult[0]) {
        $featureIndex = $featureIndexQueryResult[0]
    }

    $whatever = Invoke-WebRequest -Uri "$global:lampHost/setLanguage?language=$languageID" -Method POST -Headers $global:authorizationToken -Body ' ' -UseBasicParsing

    # Map the Wiktionary part-of-speech name to the value stored in feature list 1.
    $tisanePOSValue = $pos.ToUpper()
    switch ($tisanePOSValue) {
        'ADJECTIVE' { $tisanePOSValue = 'ADJ' }
        'ADVERB' { $tisanePOSValue = 'ADV' }
        'PREPOSITION' { $tisanePOSValue = 'PREP' }
        'CONJUNCTION' { $tisanePOSValue = 'CJ' }
        'INTERJECTION' { $tisanePOSValue = 'INTJ' }
    }

    if ($retag) {
        $retag = [char]::ToLower($retag)
    }

    # Values embedded in SQL string literals have their single quotes doubled.
    $sqlPOSValue = $tisanePOSValue.Replace("'", "''")
    $sqlCategory = $category.Replace("'", "''")

    $updated = 0
    $articleCount = 0
    $existCount = 0
    $bookmark = ''   # explicit start value so a $bookmark in a parent scope cannot leak into the first request
    do {
        # The continuation token comes straight from the API response and may contain reserved characters.
        $escapedBookmark = [uri]::EscapeDataString([string]$bookmark)
        $wikidataUrl = "https://en.wiktionary.org/w/api.php?action=query&generator=categorymembers&format=json&gcmtitle=Category:$category&prop=pageprops&gcmlimit=500&gcmcontinue=$escapedBookmark"
        "Loading entries from $wikidataUrl"
        $wikidataResponse = Invoke-WebRequest -Uri $wikidataUrl -Method GET -UseBasicParsing
        $listOfInstances = ConvertFrom-Json -InputObject $wikidataResponse.Content
        if ($listOfInstances.continue) {
            $bookmark = $listOfInstances.continue.gcmcontinue
        } else {
            $bookmark = $null
        }

        foreach ($pageProperty in @($listOfInstances.query.pages.PSObject.Properties)) {
            $page = $pageProperty.Value
            if (-not $page.title) {
                continue   # otherwise, it's likely less important
            }

            $articleCount += 1
            $originalWord = $page.title
            $pageId = $page.pageid
            $actualWord = $originalWord -replace ' ', '_'
            $wiktionaryParser = New-Object Tisane.Helper.EnglishWiktionaryParser -ArgumentList ($actualWord, $languageNamesToCodes)
            $articleJSON = ($wiktionaryParser.ToJson().Value | Where Key -eq $pos).Value

            # Find the first form carrying $label whose text starts with the lemma as rewritten
            # by one of the regexes; the position of that regex selects the feature value.
            $indexInLabelArray = -1
            $translatedTag = ''
            if ($articleJSON) {
                $article = ConvertFrom-Json -InputObject $articleJSON.ToString()
                $declensionTable = $article.inflection
                if ($declensionTable) {
                    :forms foreach ($form in @($declensionTable)) {
                        $formText = $form.text
                        foreach ($formCategory in @($form.categories)) {
                            if ($formCategory -ne $label) {
                                continue
                            }
                            # that's the form!!!
                            for ($i = 0; $i -lt $replaceRegexes.Length; $i++) {
                                $rx = $replaceRegexes[$i]
                                if ($originalWord -notmatch $rx) {
                                    continue
                                }
                                # the original word matches the regex...
                                $rw = $replaceWith[$i]
                                $modifiedLemma = $originalWord -replace $rx, $rw
                                # Escape wildcard characters so the lemma is compared literally as a prefix.
                                $modifiedLemmaWildcard = [System.Management.Automation.WildcardPattern]::Escape($modifiedLemma) + '*'
                                if ($formText -like $modifiedLemmaWildcard) {
                                    # ... and the modified lemma is found in the actual form
                                    $indexInLabelArray = $i
                                    $translatedTag = $values[$i]
                                    Write-Host "Entry $originalWord -> $modifiedLemma : $rx -> $rw detected => $listId = $translatedTag" -ForegroundColor Green
                                    break forms
                                }
                            }
                        }
                    }
                }
            }

            if ($indexInLabelArray -le -1) {
                continue
            }

            $sqlWord = ([string]$originalWord).Replace("'", "''")
            $sqlTag = ([string]$translatedTag).Replace("'", "''")
            $posFilterSQL = "EXISTS (SELECT TOP 1 1 FROM dbo.Features ff WHERE ff.ConnectionType = 2 AND ff.EntityID = lf.FamilyID AND ff.FeatureListID = 1 AND ff.FeatureValue = '$sqlPOSValue')"

            $lexemeExistsWithoutCorrectFeatureSQL = "USE $dbase; SELECT l.Id, " +
                "(SELECT TOP 1 f.FeatureValue FROM dbo.Features f WHERE f.ConnectionType = 1 AND f.EntityID = l.ID AND f.FeatureListID = $listId) CurrentFeatureValue, " +
                "l.LastUpdatedBy FROM dbo.Lexemes l WHERE l.LanguageID = $languageID AND l.MainLemma = N'$sqlWord' AND " +
                "EXISTS (SELECT TOP 1 1 FROM dbo.LexemeFamilies lf WHERE lf.LexemeID = l.ID AND $posFilterSQL )"
            $matchingLexemes = Invoke-Sqlcmd -Query $lexemeExistsWithoutCorrectFeatureSQL
            if (-not $matchingLexemes) {
                Write-Host "Lexeme $originalWord does not exist" -ForegroundColor Red
                continue
            }

            $existCount += 1
            foreach ($row in @($matchingLexemes)) {
                $lexemeId = $row[0]
                $existingValue = $row[1]
                if ($existingValue -is [DBNull]) {
                    $existingValue = $null
                }
                $lastUpdatedBy = $row[2]
                if ($lastUpdatedBy -is [DBNull] -or $lastUpdatedBy -eq 'bulkimport') {
                    $lastUpdatedBy = $null
                }
                if ($lastUpdatedBy -and -not ($touched)) {
                    Write-Host "Lexeme $originalWord (id $lexemeId) has been touched by a linguist and will not be updated" -ForegroundColor Red
                    continue
                }

                # there is a lexeme we can and are allowed to update
                if ($retag -eq 'c') {
                    # delete all features, it will be retagged
                    $clearSQL = "USE $dbase; DELETE dbo.Features WHERE ConnectionType = 1 AND EntityId = $lexemeId"
                    Invoke-Sqlcmd -Query $clearSQL
                    $existingValue = $null
                }

                $updated += 1
                # ${category} needs the braces: "$category_$pageId" would expand the non-existent
                # variable $category_ and drop the category from the batch tag.
                $lexemeUpdateSQL = "USE $dbase; UPDATE l SET l.LastBatchUpdate = GETUTCDATE(), l.LastUpdatedByBatch = 'importFeatureFromWiktionaryDeclension_${sqlCategory}_$pageId' FROM dbo.Lexemes l WHERE l.Id = $lexemeId"

                if (-not $existingValue) {
                    # there's no existing value
                    Write-Host "Updating $originalWord to $listID = $translatedTag; not set previously" -ForegroundColor Green
                    $insertSQL = "USE $dbase; INSERT INTO dbo.Features (ConnectionType, EntityID, FeatureListID, FeatureValue) SELECT 1, $lexemeId, $listId, '$sqlTag'"
                    Invoke-Sqlcmd -Query $insertSQL
                    Invoke-Sqlcmd -Query $lexemeUpdateSQL
                } elseif ($existingValue -ne $translatedTag) {
                    Write-Host "Updating $originalWord to $listID = $translatedTag. Existing value: $existingValue" -ForegroundColor Green
                    $updateSQL = "USE $dbase; UPDATE f SET f.FeatureValue = '$sqlTag' FROM dbo.Lexemes l " +
                        "INNER JOIN dbo.LexemeFamilies lf ON lf.LexemeID = l.ID AND $posFilterSQL " +
                        "INNER JOIN dbo.Features f ON f.ConnectionType = 1 AND f.EntityID = l.ID AND f.FeatureListID = $listId WHERE l.Id = $lexemeId"
                    Invoke-Sqlcmd -Query $updateSQL
                    Invoke-Sqlcmd -Query $lexemeUpdateSQL
                }

                if ($retag) {
                    # Fetch the lexeme, have the server re-tag it, then store the re-tagged version.
                    $response = Invoke-RestMethod -Uri "$global:productionHost/lexeme?id=$lexemeId" -Method GET -Headers $global:authorizationToken
                    $lexeme = @{
                        id = $lexemeId
                        lemma = $originalWord
                        stem = $originalWord
                        grammar = $response.grammar # existing grammar; cleared before if needed
                    }
                    #$lexeme.grammar += @([Feature]::new($featureIndex, $translatedTag, "Grammar"))
                    $lexemeJson = ConvertTo-Json -InputObject $lexeme
                    $taggedLemma = Invoke-RestMethod -Uri "$global:productionHost/tagLemma" -Method POST -Headers $global:authorizationToken -Body ([System.Text.Encoding]::UTF8.GetBytes($lexemeJson))
                    $lexeme.grammar = $taggedLemma.grammar
                    $lexeme.stem = $taggedLemma.stem
                    if ($taggedLemma.style) {
                        $lexeme.style = $taggedLemma.style
                    }
                    $lexeme.requestId = $response.requestId # need for the update request
                    $lexemeJson = ConvertTo-Json -InputObject $lexeme
                    $response = Invoke-RestMethod -Uri "$global:productionHost/lexeme" -Method PUT -Headers $global:authorizationToken -Body ([System.Text.Encoding]::UTF8.GetBytes($lexemeJson))
                }
            }
        }
    } while ($bookmark)

    "Updated: $updated, number of articles: $articleCount, $existCount lexeme(s) exist"
}
#EndRegion '.\Public\Import-FeatureFromWiktionaryDeclension.ps1' 254
#Region '.\Public\Import-Wikidata.ps1' 0
## =============================================================================
##
## This script's purpose is to import new entries from Wikidata for all the supported languages
##
## =============================================================================
function Import-Wikidata{
[CmdletBinding()]
Param(
# [Parameter(Mandatory = $true, valueFromPipeline=$true, HelpMessage="LaMP user: ")][String] $user,
# [Parameter(Mandatory = $true, HelpMessage="LaMP password: ")][String] $password,
[Parameter(Mandatory = $false, HelpMessage="Language: ")][String] $language,
[Parameter(Mandatory = $false, HelpMessage="Changes: ")][boolean] $changes
)
# SELECT fa.Definition, fa.Id, other.Id OtherId, other.Definition otherDefinition FROM dbo.Families fa INNER JOIN dbo.Families other ON other.WikidataId = fa.WikidataId AND other.Id > fa.Id WHERE fa.WikidataId IS NOT NULL ## duplicates
## 60000 was where it ended
$global:maxEntries = 0
# Body of Import-Wikidata, in order:
#   1. builds the list of Wikidata label codes to import (one ImportedLanguage per code),
#   2. selects the families that carry a Wikidata ID (restricted by -language / -changes),
#   3. downloads each entity and queues its labels for every language that has no
#      'wikidata'-sourced lexeme in that family yet,
#   4. flushes the queues through the importFamilies REST endpoint and removes duplicated proper names.
$ttr = New-Object System.Diagnostics.TextWriterTraceListener("C:\Tisane\wikidata.log")
[System.Diagnostics.Trace]::Listeners.Add($ttr)
[System.Diagnostics.Trace]::AutoFlush = $true

# . ".\normalizationLib.ps1"
## A commented-out TrustAllCertsPolicy workaround for HTTPS errors used to live here
## (https://stackoverflow.com/questions/11696944/powershell-v3-invoke-webrequest-https-error); it stays disabled.

Login-Lamp
$global:productionHost = 'https://lampws.tisane.ai:443'

# One lexeme waiting to be imported: the word, the family it joins and the Wikidata ID it came from.
class LexiconEntry {
    [string]$word
    [int]$familyId
    [string]$wikidata
    LexiconEntry([string]$word, [int]$familyId, [string]$wikidata) {
        $this.word = $word
        $this.familyId = $familyId
        $this.wikidata = $wikidata
    }
}

# Queue of pending entries for one Wikidata label code, mapped to a Tisane language ID.
class ImportedLanguage {
    [string]$ietf
    [int]$tisaneId
    $entries

    ImportedLanguage([string]$ietf, [int]$tisaneId) {
        $this.ietf = $ietf
        $this.tisaneId = $tisaneId
        $this.entries = @()
    }

    # Normalizes a label and queues it; flushes once more than 300 entries are pending.
    AddWord([string]$word, [int]$familyId, [string]$wikidata, [boolean]$proper) {
        try {
            if (-not $proper -and -not ($this.ietf -eq 'de')) {
                if (-not $word.ToLower().Equals($word) -and $word.Substring(1).Equals($word.Substring(1).ToLower())) {
                    # uncapitalize
                    $word = $word.ToLower()
                }
            }
            $languageCode = $this.ietf
            # drop a trailing parenthesised qualifier
            $toRemovePos = $word.IndexOf('(')
            if ($toRemovePos -gt 0) {
                $word = $word.Substring(0, $toRemovePos).Trim()
            }
            $word = GetNormalizedWikidataLemma -language $this.ietf -entry $word
            $newLexeme = @([LexiconEntry]::new($word, $familyId, $wikidata))
            if ($word.Trim() -like '* *') {
                # multi-word entry: attach the segmentation SQL when there is any
                $mweSegments = GetSegmentSQL -language $this.ietf -entry $word.Trim()
                if ($mweSegments) {
                    $newLexeme | Add-Member -MemberType NoteProperty -Force -Name 'mweSQL' -Value $mweSegments
                }
            }
            $this.entries += $newLexeme
            if ($this.entries.length -gt $global:maxEntries) {
                $global:maxEntries = $this.entries.length
                Write-Host "Adding ($languageCode) $word (family $familyId)"
            }
        } catch {
            Write-Host "Error saving: $_" -ForegroundColor Red
        }
        if ($this.entries.length -gt 300) {
            $this.Flush()
            $global:maxEntries = 0
        }
    }

    # Sends every queued entry to the importFamilies endpoint, then empties the queue.
    Flush() {
        $languageId = $this.tisaneId
        $entryCount = $this.entries.length
        Write-Progress -Activity "Saving language $languageId - $entryCount entries"
        try {
            $whatever = Invoke-WebRequest -Uri "$global:productionHost/setLanguage?language=$languageId" -Method POST -Headers $global:authorizationToken -Body ' ' -UseBasicParsing
        } catch {
            Write-Host "Error setting language: $_" -ForegroundColor Red
            # 'return', not 'break': there is no loop here, so 'break' would unwind into the
            # caller's pipeline and abort the import. The queue is kept and retried on the next flush.
            return
        }
        foreach ($entry in $this.entries) {
            $id = $entry.familyId
            $wrd = $entry.word
            $wdid = $entry.wikidata
            try {
                #TODO: integrate segmentation cues
                Write-Progress -Activity "Saving language $languageId $wrd [$id]"
                # Escape the values so that '&', '#', '+' etc. inside a word cannot corrupt the query string.
                $escapedWord = [uri]::EscapeDataString([string]$wrd)
                $escapedSource = [uri]::EscapeDataString([string]$wdid)
                $ack = Invoke-RestMethod -Uri "$global:productionHost/importFamilies?lexeme=$escapedWord&families=$id&behavior=f1&source=wikidata&orgId=$escapedSource" -Method POST -Headers $global:authorizationToken
            } catch {
                Write-Host "Error saving: $_" -ForegroundColor Red
            }
        }
        $this.entries = @()
        Invoke-Sqlcmd -Query "USE tisane; DELETE edits WHERE username = 'bulkimport'" # prevent log blowout
        Invoke-Sqlcmd -Query "USE tisane; update lexemes SET caninflect = 1 WHERE created > GETUTCDATE() - 0.45 AND caninflect = 0 AND sourcetype = 'wikidata'"
        # The statements that re-link or delete lexemes attached to the wrong family are run by the -changes branch below.
    }
}

# SSL3 is no longer requested: it is a broken protocol.
[Net.ServicePointManager]::SecurityProtocol = "Tls12, Tls11, Tls"

$languages = Invoke-RestMethod -Uri "$global:productionHost/languages" -Method GET

# Additional Wikidata label codes that feed a Tisane language.
$wikidataAliases = @{
    'no' = @('nn')
    'pt' = @('pt-br')
    'ps-AF' = @('ps')
    'zh-TW' = @('zh-hk', 'zh-tw', 'zh-hant')
    'zh-CN' = @('zh', 'zh-cn', 'zh-sg', 'zh-hans')
    'yue' = @('yue', 'zh-yue')
}

$toImport = @()
foreach ($lang in @($languages)) {
    $iso = $lang.ISOCode
    if (-not $iso -or $iso -eq 'en') {
        continue
    }
    if ($language -and $iso -ne $language) {
        continue
    }
    $codes = @($iso)
    if ($wikidataAliases.ContainsKey($iso)) {
        $codes += $wikidataAliases[$iso]
    }
    # Codes are matched case-insensitively against the label names, so 'yue' twice or
    # 'zh-tw' next to 'zh-TW' would queue every label twice; keep one object per code.
    $seenCodes = @{}
    foreach ($code in $codes) {
        if ($seenCodes.ContainsKey($code)) {
            continue
        }
        $seenCodes[$code] = $true
        $toImport += @([ImportedLanguage]::new($code, $lang.id))
    }
}

# remove wrong ones: DELETE l from lexemes l INNER JOIN lexemefamilies lf ON lf.lexemeid = l.id INNER JOIN families fa ON fa.id = lf.familyid AND NOT (fa.WikidataId = l.SourceId) WHERE l.created > GETUTCDATE() - 1.5 AND l.SourceType = 'wikidata'
$addOnSql = ""
if ($changes) {
    # Re-link, then delete, recent wikidata lexemes attached to a family with a different Wikidata ID,
    # and restrict the run to families touched within the last 20 days.
    Invoke-Sqlcmd -Query "USE tisane; UPDATE lf SET lf.FamilyId = bywiki.Id, lf.LastUpdatedBy = 'sql_wikidata_fix', lf.LastUpdate = GETUTCDATE() FROM dbo.Lexemes l INNER JOIN dbo.LexemeFamilies lf ON lf.LexemeId = l.Id INNER JOIN dbo.Families fa ON fa.Id = lf.FamilyId AND (fa.WikidataId <> l.SourceId OR fa.WikidataId IS NULL) INNER JOIN dbo.Families bywiki ON bywiki.WikidataId = l.SourceId AND NOT EXISTS (SELECT TOP 1 1 FROM dbo.LexemeFamilies bywikilf WHERE bywikilf.FamilyId = bywiki.Id AND bywikilf.LexemeId = l.Id) WHERE l.Created > GETUTCDATE() - 25 AND l.SourceType = 'wikidata'; DELETE l FROM dbo.Lexemes l INNER JOIN dbo.LexemeFamilies lf ON lf.LexemeId = l.Id INNER JOIN dbo.Families fa ON fa.Id = lf.FamilyId AND (fa.WikidataId <> l.SourceId OR fa.WikidataId IS NULL) WHERE l.Created > GETUTCDATE() - 30 AND l.SourceType = 'wikidata'"
    $addOnSql = " AND (f.LastUpdate > GETUTCDATE() - 20 OR f.Created > GETUTCDATE() - 20)"
}

if ($language) {
    if ($toImport.Count -eq 0) {
        # Without a match $langId would be empty and produce invalid SQL below.
        Write-Host "Language $language is not available for import" -ForegroundColor Red
        return
    }
    $langId = $toImport[0].tisaneId
    $sql = "USE tisane; SELECT Id, WikidataId, Description, IsProperNoun FROM Families f WHERE LEN(WikidataId) > 1 AND NOT EXISTS (SELECT 1 FROM dbo.Lexemes l INNER JOIN dbo.LexemeFamilies lf ON lf.LexemeId = l.Id AND lf.FamilyId = f.Id WHERE l.LanguageId = $langId ) $addOnSql"
    $families = Invoke-Sqlcmd -Query $sql
} else {
    #$families = Invoke-Sqlcmd -Query "USE tisane; SELECT Id, WikidataId, Description, IsProperNoun FROM Families f WHERE WikidataId IS NOT NULL AND NOT EXISTS (SELECT 1 FROM dbo.Lexemes l WHERE l.SourceType = 'wikidata' AND l.SourceId = f.WikidataId)"
    $families = Invoke-Sqlcmd -Query "USE tisane; SELECT Id, WikidataId, Description, IsProperNoun FROM Families f WHERE LEN(WikidataId) > 1 $addOnSql"
}

$familyRows = @($families | Where-Object { $null -ne $_ })
$familyCount = $familyRows.Count
$i = 0
foreach ($familyRow in $familyRows) {
    $familyId = $familyRow.Id
    $wikidataId = $familyRow.WikidataId
    $description = $familyRow.Description
    $proper = $familyRow.IsProperNoun
    $pct = [int][math]::Floor(100 * $i / $familyCount)
    $i += 1
    Write-Progress -Activity "[$familyId] $description max entries: $global:maxEntries" -Status "$pct% complete" -PercentComplete $pct

    $wikidataUrl = "https://www.wikidata.org/w/api.php?action=wbgetentities&ids=$wikidataId&format=json"
    try {
        $wikidataResponse = Invoke-WebRequest -Uri $wikidataUrl -Method GET # -UseBasicParsing
    } catch {
        # Skip the family instead of going on with the response of the previous one.
        Write-Host "Error loading ${wikidataId} ($familyId): $_" -ForegroundColor Red
        continue
    }
    $wd = ConvertFrom-Json -InputObject $wikidataResponse.Content
    # Address the entity by its ID rather than regex-replacing the ID in the raw JSON,
    # which also rewrote longer IDs that merely start with it.
    $entity = $wd.entities.$wikidataId
    $labels = $entity.labels
    $descriptions = $entity.descriptions

    # NOTE(review): the trailing '1' means this can only match a description with that exact text -
    # looks like a deliberate way to switch the clean-up off; confirm before "fixing" it.
    if ($descriptions -and $descriptions.en.value -eq 'Wikimedia disambiguation page1') {
        Invoke-Sqlcmd -Query "USE tisane; UPDATE dbo.Families SET WikidataId = NULL, LastUpdatedByBatch = 'badwikidata_$wikidataId', LastBatchUpdate = GETUTCDATE() WHERE Id = $familyId"
        Write-Host "Removing Wikidata disambiguation page: $wikidataId ($familyId)" -ForegroundColor Yellow
        continue
    }

    if (-not $labels) {
        #Invoke-Sqlcmd -Query "USE tisane; UPDATE dbo.Families SET WikidataId = NULL, LastUpdatedByBatch = 'badwikidata_$wikidataId', LastBatchUpdate = GETUTCDATE() WHERE Id = $familyId"
        Write-Host "Incorrect Wikidata: $wikidataId ($familyId)?" -ForegroundColor Yellow
        continue
    }

    # Languages that already have a wikidata-sourced lexeme in this family.
    # (A check that the English label exists as a lexeme of the family used to gate families
    # below 100000; it had been switched off and is therefore not reproduced.)
    $filledLanguages = Invoke-Sqlcmd -Query "USE tisane; SELECT l.ISOCode FROM dbo.LexemeFamilies lf INNER JOIN dbo.Lexemes lx ON lx.Id = lf.LexemeId INNER JOIN dbo.Languages l ON l.Id = lx.LanguageId AND lx.SourceType = 'wikidata' WHERE lf.FamilyId = $familyId"
    $filledCodes = @($filledLanguages | Where-Object { $null -ne $_ } | ForEach-Object { $_.ISOCode })

    foreach ($labelMember in @($labels | Get-Member -Type Properties)) {
        $nm = $labelMember.Name
        $wikidataEntry = $labels.$nm.value
        if ($filledCodes -contains $nm) {
            continue   # already imported for this language
        }
        if ($wikidataEntry -like '*,*' -or $wikidataEntry -like '*/*') {
            continue   # labels holding a list of alternatives are not imported
        }
        foreach ($target in $toImport) {
            if ($nm -eq $target.ietf) {
                $target.AddWord($wikidataEntry, $familyId, $wikidataId, $proper)
            }
        }
    }
}

foreach ($target in $toImport) {
    $target.Flush()
}

# remove duplicated names
Invoke-Sqlcmd -Query "USE tisane; DELETE l FROM Lexemes l INNER JOIN LexemeFamilies lf ON lf.LexemeId = l.Id AND EXISTS (SELECT TOP 1 1 FROM LexemeFamilies elf INNER JOIN Lexemes el ON el.Id = elf.LexemeId AND el.LanguageId = 7 AND el.MainLemma = l.MainLemma WHERE elf.FamilyId = lf.FamilyId) INNER JOIN Families f ON f.Id = lf.FamilyId AND f.IsProperNoun = 1 AND f.Definition LIKE '%name%' WHERE l.Created > GETUTCDATE() - 0.1 AND l.SourceType = 'wikidata'"
}
#EndRegion '.\Public\Import-Wikidata.ps1' 298
#Region '.\Public\Import-WikidataDomain.ps1' 0
## =============================================================================
##
## This script's purpose is to import new entries from Wikidata based on a category in Wikipedia
##
## =============================================================================
function Import-WikidataDomain{
[CmdletBinding()]
Param(
# [Parameter(Mandatory = $true, valueFromPipeline=$true, HelpMessage="LaMP user: ")][String] $user,
# [Parameter(Mandatory = $true, HelpMessage="LaMP password: ")][String] $password,
[Parameter(Mandatory = $false, HelpMessage="Domain ID: ")][int] $domain,
[Parameter(Mandatory = $false, HelpMessage="Filter Hypernym Wikidata: ")][String] $instanceOf,
[Parameter(Mandatory = $true, HelpMessage="Filter Hypernym ID: ")][int] $hypernym,
[Parameter(Mandatory = $true, HelpMessage="Domain Wikidata: ")][String] $category,
[Parameter(Mandatory = $false, HelpMessage="Remove: ")][String] $remove,
[Parameter(Mandatory = $false, HelpMessage="Start Family ID: ")][int] $startFamily
)
# $authenticationBody = '["' + $user + '", "' + $password + '"]'
# [Net.ServicePointManager]::SecurityProtocol = "tls12, tls11, tls"
# [Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12 -bor [Net.SecurityProtocolType]::Tls11 -bor [Net.SecurityProtocolType]::Tls
# $global:productionHost = 'https://lampws.tisane.ai:443'
#
# $productionAuthentication = Invoke-WebRequest -Uri "$global:productionHost/authenticate" -Method POST -Body $authenticationBody -UseBasicParsing
# $inJson = ConvertFrom-Json -InputObject $productionAuthentication.Content
# $global:authorizationToken = @{}
# $global:authorizationToken.Add('Authorization', $inJson.token)
# $OutputEncoding = [console]::InputEncoding = [console]::OutputEncoding = New-Object System.Text.UTF8Encoding

# Body of Import-WikidataDomain: walks the members of the English Wikipedia category $category and
# calls New-NounFamily for every member that is linked to a Wikidata item, is not yet known as a
# family, and is an instance or subclass of $instanceOf ('Q' switches that filter off).
Login-Lamp

# Fall back to the Wikidata IDs stored on the domain / hypernym families.
if (-not($category)) {
    $sql = "USE tisane; SELECT Id, WikidataId FROM Families f WHERE Id = $domain"
    $family = Invoke-Sqlcmd -Query $sql
    $category = $family[1]
}
if (-not($instanceOf)) {
    $sql = "USE tisane; SELECT Id, WikidataId FROM Families f WHERE Id = $hypernym"
    $family = Invoke-Sqlcmd -Query $sql
    $instanceOf = $family[1]
}

# Feature list 22 of the hypernym tells whether the new families denote persons.
# Reading the column by name works for zero, one or several returned rows.
$sql = "USE tisane; SELECT FeatureValue FROM Features f WHERE ConnectionType = 2 AND EntityId = $hypernym AND FeatureListId = 22"
$f22Value = Invoke-Sqlcmd -Query $sql
$isPerson = 0
if (@($f22Value | Where-Object { $null -ne $_ } | ForEach-Object { $_.FeatureValue }) -contains 'PERS') {
    $isPerson = 1
}

if (-not $startFamily) {
    $startFamily = 125000
}
if ($startFamily -lt $hypernym) {
    $startFamily = $hypernym
}
if ($startFamily -lt $domain) {
    $startFamily = $domain
}

$bookmark = ''
do {
    # The category listing is paged (500 per request); follow gcmcontinue until it is exhausted.
    $escapedBookmark = [uri]::EscapeDataString([string]$bookmark)
    #$wikidataUrl = "https://www.wikidata.org/w/api.php?action=query&prop=linkshere&format=json&lhprop=title&lhnamespace=0&lhlimit=500&titles=$category"
    $wikidataUrl = "https://en.wikipedia.org/w/api.php?action=query&generator=categorymembers&format=json&gcmtitle=Category:$category&prop=pageprops&gcmlimit=500"
    if ($bookmark) {
        $wikidataUrl += "&gcmcontinue=$escapedBookmark"
    }
    $wikidataResponse = Invoke-WebRequest -Uri $wikidataUrl -Method GET -UseBasicParsing
    $listOfInstances = ConvertFrom-Json -InputObject $wikidataResponse.Content
    if ($listOfInstances.continue) {
        $bookmark = $listOfInstances.continue.gcmcontinue
    } else {
        $bookmark = $null
    }

    foreach ($pageProperty in @($listOfInstances.query.pages.PSObject.Properties)) {
        $page = $pageProperty.Value
        if (-not $page.pageprops.wikibase_item) {
            continue   # otherwise, it's likely less important
        }
        $wikidataId = $page.pageprops.wikibase_item
        $mainLabel = $page.title
        if ($mainLabel -like '*Wiki*' -or $mainLabel -like 'List *' -or $mainLabel -like 'Category*' -or $mainLabel -like '* (*' -or $mainLabel -like 'The *') {
            continue
        }

        # Titles containing " (" were skipped above, so a remaining "(" is directly attached to the
        # name; cut from the parenthesis itself (searching for " (" here always returned -1).
        $parenthesisPos = $mainLabel.IndexOf('(')
        if ($parenthesisPos -gt 0) {
            $mainLabel = $mainLabel.Substring(0, $parenthesisPos).Trim()
        }
        # -remove is a literal fragment, so it is removed literally rather than as a regex.
        if ($remove -and $mainLabel.Contains($remove)) {
            $mainLabel = $mainLabel.Replace($remove, '').Trim()
        }
        Write-Host "[$wikidataId] $mainLabel"

        $sqlLabel = $mainLabel.Replace("'", "''")   # apostrophes are common in titles
        $sqlWikidataId = ([string]$wikidataId).Replace("'", "''")
        $sql = "USE tisane; SELECT TOP 1 Id FROM Families f WHERE WikidataId = '$sqlWikidataId' OR EXISTS (SELECT TOP 1 1 FROM dbo.Lexemes l INNER JOIN dbo.LexemeFamilies lf ON lf.LexemeId = l.Id AND lf.FamilyId = f.Id WHERE l.LanguageId = 7 AND l.MainLemma = N'$sqlLabel')"
        $family = Invoke-Sqlcmd -Query $sql
        if ($family -and $family[0]) {
            $existingFamily = $family[0]
            Write-Host "[$wikidataId] $mainLabel already exists as family $existingFamily, won't add again" -ForegroundColor Red
            continue
        }

        $wikidataUrl = "https://www.wikidata.org/w/api.php?action=wbgetentities&ids=$wikidataId&format=json"
        try {
            $wikidataResponse = Invoke-WebRequest -Uri $wikidataUrl -Method GET -UseBasicParsing
        } catch {
            # Skip the member instead of going on with the response of the previous request.
            Write-Host "[$wikidataId] $mainLabel could not be loaded: $_" -ForegroundColor Red
            continue
        }
        $wd = ConvertFrom-Json -InputObject $wikidataResponse.Content
        $entity = $wd.entities.$wikidataId

        $definition = $entity.descriptions.en.value
        if (-not $definition) {
            $definition = '???'
        }

        $instanceOfNode = $entity.claims.P31
        $subclassOf = $entity.claims.P279
        $instanceIds = @($instanceOfNode | ForEach-Object { $_.mainsnak.datavalue.value.id })
        $subclassIds = @($subclassOf | ForEach-Object { $_.mainsnak.datavalue.value.id })
        if ($instanceOf -ne 'Q' -and $instanceIds -notcontains $instanceOf -and $subclassIds -notcontains $instanceOf) {
            Write-Host "[$wikidataId] $mainLabel ($definition) skipping - not an instance of $instanceOf - $instanceOfNode" -ForegroundColor Red
            continue
        }

        # English names: the page title plus the usable English aliases.
        $englishLabels = @($mainLabel)
        if ($entity.aliases) {
            foreach ($alias in @($entity.aliases.en)) {
                if ($alias -and $alias.value -and $alias.value.Length -gt 0 -and -not($alias.value -like 'The *') -and -not($alias.value -like '* (*')) {
                    $englishLabels += $alias.value
                }
            }
        }
        New-NounFamily -hypernym $hypernym -family $startFamily -definition $definition -english $englishLabels -person $isPerson -proper 1 -wikidata $wikidataId -domain $domain
    }
} while ($bookmark)
}
#EndRegion '.\Public\Import-WikidataDomain.ps1' 116
#Region '.\Public\Set-MWEindex.ps1' 0
## =============================================================================
##
## Set-MWEindex: loads the families that carry a Wikidata ID (optionally only those without a
## lexeme in -language, optionally only recently changed ones).
## NOTE(review): the function stops after loading them; the indexing step itself is not present
## in this file - TODO confirm whether it is still to be written.
##
## =============================================================================
function Set-MWEindex{
[CmdletBinding()]
Param(
# [Parameter(Mandatory = $true, valueFromPipeline=$true, HelpMessage="LaMP user: ")][String] $user,
# [Parameter(Mandatory = $true, HelpMessage="LaMP password: ")][String] $password,
[Parameter(Mandatory = $false, HelpMessage="Language: ")][String] $language,
[Parameter(Mandatory = $false, HelpMessage="Changes: ")][boolean] $changes
)
# SELECT fa.Definition, fa.Id, other.Id OtherId, other.Definition otherDefinition FROM dbo.Families fa INNER JOIN dbo.Families other ON other.WikidataId = fa.WikidataId AND other.Id > fa.Id WHERE fa.WikidataId IS NOT NULL ## duplicates
## 60000 was where it ended
$global:maxEntries = 0
$ttr = New-Object System.Diagnostics.TextWriterTraceListener("C:\Tisane\wikidata.log")
[System.Diagnostics.Trace]::Listeners.Add($ttr)
[System.Diagnostics.Trace]::AutoFlush = $true

Login-Lamp

# The TrustAllCertsPolicy type is not defined in this function (its Add-Type block is disabled),
# so only install it when some other code has already loaded it.
# NOTE(review): this policy switches certificate validation off - confirm it is still wanted.
if ('TrustAllCertsPolicy' -as [type]) {
    [System.Net.ServicePointManager]::CertificatePolicy = New-Object TrustAllCertsPolicy
}

# $addOnSql and the language ID used to be read from variables that only exist inside
# Import-Wikidata; they are now derived here from this function's own parameters.
$addOnSql = ""
if ($changes) {
    $addOnSql = " AND (f.LastUpdate > GETUTCDATE() - 20 OR f.Created > GETUTCDATE() - 20)"
}

if ($language) {
    $sqlLanguage = $language.Replace("'", "''")
    $languageRow = Invoke-Sqlcmd -Query "USE tisane; SELECT TOP 1 Id FROM dbo.Languages WHERE ISOCode = N'$sqlLanguage'"
    if (-not $languageRow) {
        Write-Host "Language $language not found" -ForegroundColor Red
        return
    }
    $langId = $languageRow[0]
    $sql = "USE tisane; SELECT Id, WikidataId, Description, IsProperNoun FROM Families f WHERE LEN(WikidataId) > 1 AND NOT EXISTS (SELECT 1 FROM dbo.Lexemes l INNER JOIN dbo.LexemeFamilies lf ON lf.LexemeId = l.Id AND lf.FamilyId = f.Id WHERE l.LanguageId = $langId ) $addOnSql"
    $families = Invoke-Sqlcmd -Query $sql
} else {
    #$families = Invoke-Sqlcmd -Query "USE tisane; SELECT Id, WikidataId, Description, IsProperNoun FROM Families f WHERE WikidataId IS NOT NULL AND NOT EXISTS (SELECT 1 FROM dbo.Lexemes l WHERE l.SourceType = 'wikidata' AND l.SourceId = f.WikidataId)"
    $families = Invoke-Sqlcmd -Query "USE tisane; SELECT Id, WikidataId, Description, IsProperNoun FROM Families f WHERE LEN(WikidataId) > 1 $addOnSql"
}
}
#EndRegion '.\Public\Set-MWEindex.ps1' 55
#Region '.\Public\Tisane-NightlyBuild.ps1' 0 ## ============================================================================= ## ## This script's purpose is to compile all production-ready languages ## ## ============================================================================= function Tisane-NightlyBuild{ Add-Type -Assembly "System.Io.Compression.FileSystem" If (Test-Path "C:\Tisane\TisaneLaMP.log"){ Remove-Item
"C:\Tisane\TisaneLaMP.log" } # increase available memory for Japanese, Korean, Turkish, Russian, etc. Set-Item WSMan:\localhost\Shell\MaxMemoryPerShellMB 4096 taskkill /IM TisaneDBViewer.exe /F taskkill /IM HeidiSQL.exe /F /T "Compiling" $sqlCPUUsage = (Get-Counter '\Process(sqlservr)\% Processor Time').CounterSamples.CookedValue if ($sqlCPUUsage -gt 30) { Stop-Service "Tisane LaMP *LaMP*" Start-Service "Tisane LaMP *LaMP*" } C:\Tisane\tisaneCompiler xling taskkill /IM HeidiSQL.exe /F /T #Compile-LanguageNightlyBuild -languageCode ps-AF Compile-LanguageNightlyBuild -languageCode he Compile-LanguageNightlyBuild -languageCode vi Compile-LanguageNightlyBuild -languageCode es Compile-LanguageNightlyBuild -languageCode yue Compile-LanguageNightlyBuild -languageCode zh-TW Compile-LanguageNightlyBuild -languageCode ur Compile-LanguageNightlyBuild -languageCode hi Compile-LanguageNightlyBuild -languageCode id Get-Job | Wait-Job Get-job | Remove-Job # clean up Compile-LanguageNightlyBuild -languageCode fr Compile-LanguageNightlyBuild -languageCode sv # Compile-LanguageNightlyBuild -languageCode ta Get-Job | Wait-Job Get-job | Remove-Job # clean up Compile-LanguageNightlyBuild -languageCode da Compile-LanguageNightlyBuild -languageCode it Get-Job | Wait-Job Get-job | Remove-Job # clean up taskkill /IM HeidiSQL.exe /F /T Compile-LanguageNightlyBuild -languageCode ms Compile-LanguageNightlyBuild -languageCode nl Compile-LanguageNightlyBuild -languageCode no Get-Job | Wait-Job Get-job | Remove-Job # clean up taskkill /IM HeidiSQL.exe /F /T Compile-LanguageNightlyBuild -languageCode de #Compile-LanguageNightlyBuild -languageCode fa Get-Job | Wait-Job Get-job | Remove-Job # clean up taskkill /IM HeidiSQL.exe /F /T Compile-LanguageNightlyBuild -languageCode ar #Compile-LanguageNightlyBuild -languageCode uk #Compile-LanguageNightlyBuild -languageCode bn Get-Job | Wait-Job Get-job | Remove-Job # clean up Invoke-Sqlcmd -Query "USE tempdb; DBCC FREEPROCCACHE; DBCC DROPCLEANBUFFERS; 
DBCC FREESYSTEMCACHE ('ALL'); DBCC FREESESSIONCACHE" Compile-LanguageNightlyBuild -languageCode th Compile-LanguageNightlyBuild -languageCode en Get-Job | Wait-Job Get-job | Remove-Job # clean up Compile-LanguageNightlyBuild -languageCode zh-CN Compile-LanguageNightlyBuild -languageCode sq Compile-LanguageNightlyBuild -languageCode af Get-Job | Wait-Job Get-job | Remove-Job # clean up Invoke-Sqlcmd -Query "USE tempdb; DBCC FREEPROCCACHE; DBCC DROPCLEANBUFFERS; DBCC FREESYSTEMCACHE ('ALL'); DBCC FREESESSIONCACHE" Compile-LanguageNightlyBuild -languageCode ru #Compile-LanguageNightlyBuild -languageCode fi Get-Job | Wait-Job Get-job | Remove-Job # clean up Invoke-Sqlcmd -Query "USE tempdb; DBCC FREEPROCCACHE; DBCC DROPCLEANBUFFERS; DBCC FREESYSTEMCACHE ('ALL'); DBCC FREESESSIONCACHE" taskkill /IM HeidiSQL.exe /F /T Compile-LanguageNightlyBuild -languageCode ko Get-Job | Wait-Job Get-job | Remove-Job # clean up Compile-LanguageNightlyBuild -languageCode tr Get-Job | Wait-Job Get-job | Remove-Job # clean up Compile-LanguageNightlyBuild -languageCode pl Compile-LanguageNightlyBuild -languageCode pt Get-Job | Wait-Job Get-job | Remove-Job # clean up Compile-LanguageNightlyBuild -languageCode tl Get-Job | Wait-Job Get-job | Remove-Job # clean up Compile-LanguageNightlyBuild -languageCode ja Get-Job | Wait-Job Get-job | Remove-Job # clean up #Compile-LanguageNightlyBuild -languageCode ro Compile-LanguageNightlyBuild -languageCode hu #Compile-LanguageNightlyBuild -languageCode cz Get-Job | Wait-Job Get-job | Remove-Job # clean up taskkill /IM HeidiSQL.exe /F /T #Compile-LanguageNightlyBuild -languageCode el #Compile-LanguageNightlyBuild -languageCode bg $tisaneDbZipPath = "C:\Tisane" $tisaneDbArchive = "$tisaneDbZipPath\tisane_db.zip" If (Test-Path $tisaneDbArchive){ Remove-Item $tisaneDbArchive } Get-Job | Wait-Job Get-job | Remove-Job # clean up #Compress-Archive -Path C:\Tisane\outdb -DestinationPath $tisaneDbArchive $compressionLevel = 
[System.IO.Compression.CompressionLevel]::Optimal #[System.IO.Compression.ZipFile]::CreateFromDirectory("C:\Tisane\outdb\", "$tisaneDbArchive", $compressionLevel, $false) $compressionJob = Start-Job -ScriptBlock { Add-Type -Assembly "System.Io.Compression.FileSystem" [System.IO.Compression.ZipFile]::CreateFromDirectory("C:\Tisane\outdb\", "C:\Tisane\tisane_db.zip", [System.IO.Compression.CompressionLevel]::Optimal, $false) } $ftp = "ftp://internal%40tisane.ai:Tisanelabs4now4321@ftp.tisane.ai/nightlyTisaneDB.zip"; Write-Host -Object "ftp url: $ftp"; $webclient = New-Object -TypeName System.Net.WebClient; $uri = New-Object -TypeName System.Uri -ArgumentList $ftp; Write-Host -Object "Uploading $tisaneDbArchive..."; #$webclient.UploadFile($uri, $tisaneDbArchive); # C:\Tisane\tisaneCompiler.exe tests # $webclient.UploadFile($uri, "C:\Tisane\tisaneDB.zip"); Write-Host -Object "Copying to permanent location..."; #(new-object Net.WebClient).DownloadString("https://lampws.tisane.ai/freeTisane"); #(new-object Net.WebClient).DownloadString(""); taskkill /IM Tisane.TestConsole.exe /F taskkill /IM TisaneDBViewer.exe /F Stop-Service "Tisane LaMP *sandbox*" Stop-Service "Tisane LaMP *LaMP*" Stop-Service "SQL Server *MSSQLSERVER*" dir "C:\Tisane\db" | Remove-Item -Recurse -Force -ErrorAction SilentlyContinue Copy-Item -Path C:\Tisane\outdb\* -Destination C:\Tisane\db -recurse -force Start-Service "SQL Server *MSSQLSERVER*" Start-Service "Tisane LaMP *sandbox*" Start-Service "Tisane LaMP *LaMP*" Write-Host -Object "Starting nightly tests..."; [System.GC]::Collect() #$testbedJob = Start-Job -FilePath C:\LaMP\testbedrun\TisaneNightlyTestsOnly.ps1 #$testbedJob | Wait-Job #$testbedJob | Remove-Job C:\LaMP\testbedrun\Testbed.Run.exe standard tl [System.GC]::Collect() C:\LaMP\testbedrun\Testbed.Run.exe standard de [System.GC]::Collect() C:\LaMP\testbedrun\Testbed.Run.exe standard vi [System.GC]::Collect() C:\LaMP\testbedrun\Testbed.Run.exe standard es [System.GC]::Collect() 
C:\LaMP\testbedrun\Testbed.Run.exe standard hi [System.GC]::Collect() C:\LaMP\testbedrun\Testbed.Run.exe standard id [System.GC]::Collect() C:\LaMP\testbedrun\Testbed.Run.exe standard fa [System.GC]::Collect() C:\LaMP\testbedrun\Testbed.Run.exe standard ru [System.GC]::Collect() C:\LaMP\testbedrun\Testbed.Run.exe standard en [System.GC]::Collect() #C:\LaMP\testbedrun\Testbed.Run.exe standard fi #[System.GC]::Collect() #C:\LaMP\testbedrun\Testbed.Run.exe standard yue [System.GC]::Collect() C:\LaMP\testbedrun\Testbed.Run.exe standard zh-TW [System.GC]::Collect() C:\LaMP\testbedrun\Testbed.Run.exe standard af [System.GC]::Collect() C:\LaMP\testbedrun\Testbed.Run.exe standard it [System.GC]::Collect() C:\LaMP\testbedrun\Testbed.Run.exe standard ko [System.GC]::Collect() C:\LaMP\testbedrun\Testbed.Run.exe standard ms [System.GC]::Collect() C:\LaMP\testbedrun\Testbed.Run.exe standard nl [System.GC]::Collect() C:\LaMP\testbedrun\Testbed.Run.exe standard no [System.GC]::Collect() C:\LaMP\testbedrun\Testbed.Run.exe standard hu [System.GC]::Collect() C:\LaMP\testbedrun\Testbed.Run.exe standard pl [System.GC]::Collect() C:\LaMP\testbedrun\Testbed.Run.exe standard pt [System.GC]::Collect() C:\LaMP\testbedrun\Testbed.Run.exe standard fr [System.GC]::Collect() C:\LaMP\testbedrun\Testbed.Run.exe standard ja [System.GC]::Collect() C:\LaMP\testbedrun\Testbed.Run.exe standard sv [System.GC]::Collect() C:\LaMP\testbedrun\Testbed.Run.exe standard he [System.GC]::Collect() C:\LaMP\testbedrun\Testbed.Run.exe standard th [System.GC]::Collect() C:\LaMP\testbedrun\Testbed.Run.exe standard zh-CN [System.GC]::Collect() C:\LaMP\testbedrun\Testbed.Run.exe standard sq [System.GC]::Collect() C:\LaMP\testbedrun\Testbed.Run.exe standard ar [System.GC]::Collect() C:\LaMP\testbedrun\Testbed.Run.exe standard tr [System.GC]::Collect() C:\LaMP\testbedrun\Testbed.Run.exe standard ur [System.GC]::Collect() C:\LaMP\testbedrun\Testbed.Run.exe standard ps-AF [System.GC]::Collect() Invoke-Sqlcmd -Query "USE 
tisane; DELETE tr FROM TestResults tr WHERE tr.TestProviderId > 2 AND tr.IsStandard = 0 AND tr.Created < GETUTCDATE() - 1; USE tempdb; DBCC FREEPROCCACHE; DBCC DROPCLEANBUFFERS; DBCC FREESYSTEMCACHE ('ALL'); DBCC FREESESSIONCACHE; DBCC SHRINKDATABASE(tempdb, 10); DBCC shrinkfile ('tempdev'); DBCC shrinkfile ('templog')" # prevent tempdb blowout #C:\Tisane\tisaneCompiler.exe tl spell #C:\Tisane\tisaneCompiler.exe hi spell C:\LaMP\testbedrun\Testbed.Run.exe standard tl 3 C:\LaMP\testbedrun\Testbed.Run.exe standard de 3 C:\LaMP\testbedrun\Testbed.Run.exe standard ru 3 C:\LaMP\testbedrun\Testbed.Run.exe standard es 3 C:\LaMP\testbedrun\Testbed.Run.exe standard hi 3 C:\LaMP\testbedrun\Testbed.Run.exe standard fr 3 C:\LaMP\testbedrun\Testbed.Run.exe standard ko 3 C:\LaMP\testbedrun\Testbed.Run.exe standard it 3 C:\LaMP\testbedrun\Testbed.Run.exe standard af 3 C:\LaMP\testbedrun\Testbed.Run.exe standard ja 3 C:\LaMP\testbedrun\Testbed.Run.exe standard hu 3 C:\LaMP\testbedrun\Testbed.Run.exe standard sq 3 C:\LaMP\testbedrun\Testbed.Run.exe standard nl 3 C:\LaMP\testbedrun\Testbed.Run.exe standard tl 4 C:\LaMP\testbedrun\Testbed.Run.exe standard de 4 C:\LaMP\testbedrun\Testbed.Run.exe standard ru 4 C:\LaMP\testbedrun\Testbed.Run.exe standard es 4 C:\LaMP\testbedrun\Testbed.Run.exe standard hi 4 C:\LaMP\testbedrun\Testbed.Run.exe standard fr 4 C:\LaMP\testbedrun\Testbed.Run.exe standard ko 4 C:\LaMP\testbedrun\Testbed.Run.exe standard it 4 C:\LaMP\testbedrun\Testbed.Run.exe standard af 4 C:\LaMP\testbedrun\Testbed.Run.exe standard ja 4 C:\LaMP\testbedrun\Testbed.Run.exe standard nl 4 $compressionJob | Wait-Job Get-job | Remove-Job # clean up } #EndRegion '.\Public\Tisane-NightlyBuild.ps1' 257 #Region '.\Public\TisaneNightlyTestsOnly.ps1' 0 function Tisane-NightlyTestOnly{ C:\LaMP\testbedrun\Testbed.Run.exe standard ps-AF C:\LaMP\testbedrun\Testbed.Run.exe standard ar C:\LaMP\testbedrun\Testbed.Run.exe standard de C:\LaMP\testbedrun\Testbed.Run.exe standard tr 
C:\LaMP\testbedrun\Testbed.Run.exe standard vi C:\LaMP\testbedrun\Testbed.Run.exe standard es C:\LaMP\testbedrun\Testbed.Run.exe standard fa C:\LaMP\testbedrun\Testbed.Run.exe standard ru C:\LaMP\testbedrun\Testbed.Run.exe standard fi C:\LaMP\testbedrun\Testbed.Run.exe standard yue C:\LaMP\testbedrun\Testbed.Run.exe standard zh-TW C:\LaMP\testbedrun\Testbed.Run.exe standard da C:\LaMP\testbedrun\Testbed.Run.exe standard ur C:\LaMP\testbedrun\Testbed.Run.exe standard hi C:\LaMP\testbedrun\Testbed.Run.exe standard id C:\LaMP\testbedrun\Testbed.Run.exe standard it C:\LaMP\testbedrun\Testbed.Run.exe standard ko C:\LaMP\testbedrun\Testbed.Run.exe standard ms C:\LaMP\testbedrun\Testbed.Run.exe standard nl C:\LaMP\testbedrun\Testbed.Run.exe standard no C:\LaMP\testbedrun\Testbed.Run.exe standard pl C:\LaMP\testbedrun\Testbed.Run.exe standard pt C:\LaMP\testbedrun\Testbed.Run.exe standard fr C:\LaMP\testbedrun\Testbed.Run.exe standard sv C:\LaMP\testbedrun\Testbed.Run.exe standard ta C:\LaMP\testbedrun\Testbed.Run.exe standard he C:\LaMP\testbedrun\Testbed.Run.exe standard ja C:\LaMP\testbedrun\Testbed.Run.exe standard th C:\LaMP\testbedrun\Testbed.Run.exe standard en C:\LaMP\testbedrun\Testbed.Run.exe standard zh-CN } #EndRegion '.\Public\TisaneNightlyTestsOnly.ps1' 33 #Region '.\Public\Update-WordFrequency.ps1' 0 function Update-WordFrequency{ Param( [Parameter(Mandatory = $true, HelpMessage="language: ")][string] $language, [Parameter(Mandatory = $true, HelpMessage="Path: to wordlist csv")][string] $path, [Parameter(Mandatory = $false, HelpMessage="Database name: ")][string] $tisaneDb ) # $user = Get-LampSetting -settingName 'lampUser' -defaultValue '' if (-not($tisaneDb)) { $tisaneDb = 'tisane' } $user = Get-LampSetting -settingName 'lampUser' -defaultValue '' $transactUser = 'Update-WordFrequency.ps1_'+$user $languageJSON = Invoke-Sqlcmd -Query "USE $tisaneDb; SELECT Id FROM Languages WHERE ISOCode = '$language'" $languageId = $languageJSON.Id # Login-Lamp # 
$languageJSON = Invoke-RestMethod -Uri "$global:productionHost/languages" -Method GET -UseBasicParsing # $languageJSON | foreach { # if ($_.ISOCode -eq $language) { # $languageId = $_.id # } # } # Set-LampLanguage -languageId $languageId # $path = "$language"+"_graded.csv" $csv = Import-Csv $path $colunNames=$csv[0].psobject.properties.name if (("word" -in $colunNames) -And ("log_grade" -in $colunNames)){ $i=0 foreach($item in $csv){ $word=$($item.word) $grade=$($item.log_grade) # Write-Host "$word with freq $grade" $something = Invoke-Sqlcmd -Query "USE $tisaneDb; UPDATE lexemes SET FrequencyGrade = $grade,LastUpdatedByBatch = '$transactUser', LastBatchUpdate = GETUTCDATE() WHERE LanguageId = $languageId AND MainLemma= N'$word'" $pct = $i / $csv.length * 100 Write-Progress -Activity "Writing lexeme grade " -Status "$pct% $word" -PercentComplete $pct $i+=1 # if ($i -eq 100){ # break # } } } } #EndRegion '.\Public\Update-WordFrequency.ps1' 46 |