workflows/default/hooks/verify/00-privacy-scan.ps1
|
<#
.SYNOPSIS
    Scans repository files for sensitive data (local user paths, secrets,
    cloud credentials) before a commit.

.DESCRIPTION
    Enumerates files via git (staged files only, or all tracked plus untracked
    non-ignored files), skips excluded directories, binary extensions and files
    larger than 1MB, then tests every line against a list of regex patterns.
    A line containing a "// noscan" or "# noscan" marker is ignored.
    The result is written to the output stream as JSON with the keys
    success, script, message, details and failures.

.PARAMETER TaskId
    Not referenced by this script. Presumably accepted so all verify hooks
    share one signature - TODO confirm against the hook runner.

.PARAMETER Category
    Not referenced by this script. Presumably accepted so all verify hooks
    share one signature - TODO confirm against the hook runner.

.PARAMETER StagedOnly
    Pre-commit mode: scan only staged files (added/copied/modified) and also
    print a human-readable violation summary to stderr.
#>
param(
    [string]$TaskId,
    [string]$Category,
    [switch]$StagedOnly
)

# Scan repo for sensitive data before commit

$issues = @()
$details = @{
    files_scanned = 0
    violations    = @()
}

# Patterns to detect
$patterns = @(
    # Local paths (Windows/macOS/Linux)
    @{ name = "windows_user_path"; pattern = '[A-Za-z]:[/\\]+Users[/\\]+\w+'; description = "Windows user path"; caseSensitive = $false }
    @{ name = "linux_home_path"; pattern = '/home/\w+'; description = "Linux home path"; caseSensitive = $true }
    @{ name = "macos_user_path"; pattern = '/Users/\w+'; description = "macOS user path"; caseSensitive = $true }

    # Secrets and credentials
    @{ name = "api_key_value"; pattern = '(?:api[_-]?key|apikey)\s*[=:]\s*["\u0027]?[A-Za-z0-9_\-]{20,}'; description = "API key value"; caseSensitive = $false }
    @{ name = "secret_value"; pattern = '(?:secret|password|passwd|pwd)\s*[=:]\s*["\u0027]?[^\s"]{8,}'; description = "Secret/password value"; caseSensitive = $false }
    @{ name = "bearer_token"; pattern = 'Bearer\s+[A-Za-z0-9_\-\.]{20,}'; description = "Bearer token"; caseSensitive = $false }
    @{ name = "connection_string"; pattern = '(?:Server|Data Source|mongodb\+srv|postgresql|mysql)://[^\s"]+'; description = "Connection string"; caseSensitive = $false }
    @{ name = "private_key"; pattern = '-----BEGIN (?:RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----'; description = "Private key"; caseSensitive = $true }
    @{ name = "connection_string_password"; pattern = '(?:Password|Pwd)\s*=\s*[^\s;]{4,}'; description = "Connection string with password"; caseSensitive = $false }

    # Cloud credentials
    @{ name = "aws_key"; pattern = 'AKIA[0-9A-Z]{16}'; description = "AWS access key"; caseSensitive = $true }
    @{ name = "azure_key"; pattern = '(?:AccountKey|SharedAccessSignature)\s*=\s*[A-Za-z0-9+/=]{40,}'; description = "Azure key"; caseSensitive = $false }
)

# Files/paths to exclude from scanning.
# Each entry is matched at the start of a path segment (see the loop below),
# so 'bin[/\\]' excludes "bin/" and "src/bin/" but not "cabin/".
$excludePatterns = @(
    '\.git[/\\]',
    'node_modules[/\\]',
    '\.vs[/\\]',
    'bin[/\\]',
    'obj[/\\]',
    '\.bot[/\\]\.control[/\\]',
    '\.bot[/\\]hooks[/\\]',
    '\.bot[/\\]systems[/\\]',
    '\.bot[/\\]defaults[/\\]',
    '\.bot[/\\]prompts[/\\]'
)

# Binary extensions to skip
$binaryExtensions = @(
    '.exe', '.dll', '.pdb', '.zip', '.tar', '.gz', '.7z', '.rar',
    '.png', '.jpg', '.jpeg', '.gif', '.bmp', '.ico', '.svg', '.webp',
    '.mp3', '.mp4', '.wav', '.avi', '.mov',
    '.woff', '.woff2', '.ttf', '.eot',
    '.pdf', '.doc', '.docx', '.xls', '.xlsx',
    '.pyc', '.class', '.o', '.so', '.dylib', '.nupkg', '.snupkg'
)

# Max file size to scan (skip large files)
$maxFileSize = 1MB

# Marker that suppresses all findings on a line
$noScanPattern = '(?://|#)\s*noscan'

# Fall back to the current directory when not inside a git work tree
$repoRoot = git rev-parse --show-toplevel 2>$null
if (-not $repoRoot) {
    $repoRoot = Get-Location
}

if ($StagedOnly) {
    # Pre-commit mode: only scan files being committed
    $allFiles = @(
        @(git -C $repoRoot diff --cached --name-only --diff-filter=ACM 2>$null) |
            Where-Object { $_ }
    )
} else {
    # Full repo scan: tracked files plus untracked files that are not ignored
    $trackedFiles = git -C $repoRoot ls-files 2>$null
    $untrackedFiles = git -C $repoRoot ls-files --others --exclude-standard 2>$null
    # -CaseSensitive: Sort-Object -Unique is case-insensitive by default and
    # would drop one of two paths that differ only by case.
    $allFiles = @(
        @($trackedFiles) + @($untrackedFiles) |
            Where-Object { $_ } |
            Sort-Object -Unique -CaseSensitive
    )
}

foreach ($relativePath in $allFiles) {
    $fullPath = Join-Path $repoRoot $relativePath

    # Skip excluded paths (anchored to the start of a path segment)
    $skip = $false
    foreach ($exclude in $excludePatterns) {
        if ($relativePath -match "(?:^|[/\\])$exclude") {
            $skip = $true
            break
        }
    }
    if ($skip) { continue }

    # Skip binary extensions
    $ext = [System.IO.Path]::GetExtension($relativePath).ToLowerInvariant()
    if ($ext -and $ext -in $binaryExtensions) { continue }

    # Skip files that don't exist or exceed size limit.
    # -LiteralPath is required: with -Path, names containing '[' or ']'
    # (e.g. "[id].tsx") are treated as wildcards and the file would be
    # silently skipped instead of scanned.
    if (-not (Test-Path -LiteralPath $fullPath -PathType Leaf)) { continue }
    $fileInfo = Get-Item -LiteralPath $fullPath -ErrorAction SilentlyContinue
    if (-not $fileInfo -or $fileInfo.Length -gt $maxFileSize) { continue }

    $details['files_scanned']++

    $content = Get-Content -LiteralPath $fullPath -Raw -ErrorAction SilentlyContinue
    if (-not $content) { continue }

    $lineNumber = 0
    foreach ($line in ($content -split "`n")) {
        $lineNumber++

        # An explicit noscan marker suppresses every pattern for this line,
        # so test it once instead of once per pattern.
        if ($line -match $noScanPattern) { continue }

        foreach ($patternDef in $patterns) {
            # Check if pattern matches (case-sensitive or case-insensitive).
            # The result is kept in $isMatch rather than $matches, which is a
            # PowerShell automatic variable populated by -match.
            $isMatch = if ($patternDef.caseSensitive) {
                $line -cmatch $patternDef.pattern
            } else {
                $line -match $patternDef.pattern
            }
            if (-not $isMatch) { continue }

            # Trim before truncating so long, heavily indented lines still
            # yield a useful snippet (and a trailing CR is dropped).
            $trimmed = $line.Trim()
            $snippet = if ($trimmed.Length -gt 100) {
                $trimmed.Substring(0, 100) + "..."
            } else {
                $trimmed
            }

            $details['violations'] += @{
                file        = $relativePath
                line        = $lineNumber
                pattern     = $patternDef.name
                description = $patternDef.description
                snippet     = $snippet
            }

            $issues += @{
                issue    = "$($patternDef.description) in $relativePath`:$lineNumber"
                severity = "error"
                context  = "Remove or redact sensitive data before committing"
            }
        }
    }
}

# Deduplicate issues that share the same issue text.
# The @(...) wrapper is essential: with exactly one issue the pipeline returns
# a bare hashtable, whose .Count is its number of keys (3), not 1.
$uniqueIssues = @($issues | Sort-Object { "$($_.issue)" } -Unique -CaseSensitive)
$issueCount = $uniqueIssues.Count

$details['scan_mode'] = if ($StagedOnly) { 'staged' } else { 'full' }

if ($StagedOnly -and $issueCount -gt 0) {
    # Human-readable summary on stderr; stdout carries the JSON result
    [Console]::Error.WriteLine("")
    [Console]::Error.WriteLine("dotbot privacy scan: $issueCount violation(s) in staged files:")
    foreach ($v in $details['violations']) {
        [Console]::Error.WriteLine(" $($v.file):$($v.line) - $($v.description)")
    }
    [Console]::Error.WriteLine("")
    [Console]::Error.WriteLine("Remove or redact sensitive data before committing.")
}

@{
    success  = ($issueCount -eq 0)
    script   = "00-privacy-scan.ps1"
    message  = if ($issueCount -eq 0) { "No sensitive data detected" } else { "$issueCount privacy violation(s) found" }
    details  = $details
    failures = @($uniqueIssues)
} | ConvertTo-Json -Depth 10