From 04110cda451d0b78d9bf92f4764d76bf97bfdb08 Mon Sep 17 00:00:00 2001 From: shaunluedeke Date: Tue, 2 Jun 2026 21:11:36 +0200 Subject: [PATCH] Add support for string, binary, and base64 payload sanitization with tests --- README.md | 51 +++++ src/FileSanitizer.php | 200 +++++++++++++++++- .../FileSanitizerInputStringTest.php | 77 +++++++ 3 files changed, 318 insertions(+), 10 deletions(-) create mode 100644 tests/Sanitizer/FileSanitizerInputStringTest.php diff --git a/README.md b/README.md index eafc1cf..8e3ee32 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,57 @@ if (!$result['scan']->safe) { echo 'Sanitized file written to: ' . $result['sanitize']->outputPath . PHP_EOL; ``` +## String and base64 input + +You can sanitize raw payloads directly without creating input files yourself. + +```php +ok'; +$result = $sanitizer->processString($html, 'upload.html', true); + +echo $result['sanitizedData']; +``` + +`processString()` also accepts optional `filenameHint` and optional `mimeType` as the 2nd and 5th argument. +If `mimeType` is `null`, FileSanitizer detects it from the payload data. +It also supports Python-style bytes literal input like `b"...\\xFF..."` or `b'...\\x00...'`. + +For raw binary bytes, use `processBinary()`: + +```php +processBinary($bytes, null, null, true, null); +``` + +For base64 payloads (including `data:*;base64,...` input): + +```php +'); +$result = $sanitizer->processBase64($payload, 'upload.svg', true); + +echo $result['sanitizedBase64']; +``` + +`processBase64()` also accepts optional `filenameHint` and optional `mimeType` as the 2nd and 5th argument. +If `mimeType` is `null`, it first uses Data-URI MIME (if present), otherwise detects from decoded data. + ## sanitizeAlways mode When `sanitizeAlways` is enabled, FileSanitizer will attempt best-effort sanitization even if risky content is detected during scanning. 
diff --git a/src/FileSanitizer.php b/src/FileSanitizer.php
index 7b3fd4d..d625ecd 100644
--- a/src/FileSanitizer.php
+++ b/src/FileSanitizer.php
@@ -2,6 +2,7 @@
 
 namespace SytxLabs\FileSanitizer;
 
+use Exception;
 use RuntimeException;
 use SytxLabs\FileSanitizer\Contracts\SanitizerInterface;
 use SytxLabs\FileSanitizer\Contracts\ScannerInterface;
@@ -44,6 +45,139 @@ public function process(string $inputPath, bool|string|null $outputPath = null,
         }
 
         $mimeType = ($this->mimeDetector ?? new MimeDetector())->detect($inputPath);
+
+        return $this->processWithMimeType($inputPath, $mimeType, $outputPath, $sanitizeAlways);
+    }
+
+    /**
+     * Processes a file with sanitizeAlways forced on; shorthand for process($inputPath, $outputPath, true).
+     *
+     * @return array{mimeType:string, scan:ScanReport, sanitize:SanitizeReport}
+     * @noinspection PhpUnused
+     */
+    public function sanitizeAlways(string $inputPath, ?string $outputPath = null): array
+    {
+        return $this->process($inputPath, $outputPath, true);
+    }
+
+    /**
+     * Sanitizes an in-memory string payload.
+     *
+     * Input shaped like a Python bytes literal (b"..." or b'...') is unescaped first; use processBinary() to pass bytes through unchanged.
+     *
+     * @param bool|string|null $outputPath Output file path; a bool given here is treated as $sanitizeAlways.
+     * @param string|null $mimeType MIME type to use; detected from the payload when null.
+     *
+     * @return array{mimeType:string, scan:ScanReport, sanitize:SanitizeReport, sanitizedData:string}
+     */
+    public function processString(string $data, ?string $filenameHint = null, bool|string|null $outputPath = null, bool $sanitizeAlways = false, ?string $mimeType = null): array
+    {
+        return $this->processDataInput($data, $filenameHint, $outputPath, $sanitizeAlways, $mimeType);
+    }
+
+    /**
+     * Sanitizes raw bytes exactly as given (no bytes-literal unescaping).
+     *
+     * @param bool|string|null $outputPath Output file path; a bool given here is treated as $sanitizeAlways.
+     * @param string|null $mimeType MIME type to use; detected from the payload when null.
+     *
+     * @return array{mimeType:string, scan:ScanReport, sanitize:SanitizeReport, sanitizedData:string}
+     */
+    public function processBinary(string $binaryData, ?string $filenameHint = null, bool|string|null $outputPath = null, bool $sanitizeAlways = false, ?string $mimeType = null): array
+    {
+        return $this->processDataInput($binaryData, $filenameHint, $outputPath, $sanitizeAlways, $mimeType, false);
+    }
+
+    /**
+     * Writes the payload to a temporary file, processes it and returns the report plus the sanitized bytes.
+     *
+     * The temporary input is always removed; the output file is removed too unless $outputPath was a string.
+     *
+     * @return array{mimeType:string, scan:ScanReport, sanitize:SanitizeReport, sanitizedData:string}
+     * @throws RuntimeException When the temporary input file cannot be written.
+     */
+    private function processDataInput(string $data, ?string $filenameHint = null, bool|string|null $outputPath = null, bool $sanitizeAlways = false, ?string $mimeType = null, bool $normalizeStringLiterals = true): array
+    {
+        if (is_bool($outputPath)) {
+            $sanitizeAlways = $outputPath;
+            $outputPath = null;
+        }
+        $normalizedData = $normalizeStringLiterals ? $this->normalizeStringInput($data) : $data;
+        $resolvedMimeType = $mimeType ?? $this->detectMimeTypeFromData($normalizedData);
+        $inputPath = $this->createTempInputPath($filenameHint, $resolvedMimeType);
+        $result = null;
+        try {
+            // Write inside the try so the temporary directory is removed even when the write fails.
+            if (file_put_contents($inputPath, $normalizedData) === false) {
+                throw new RuntimeException('Could not write temporary input file for string processing.');
+            }
+            $result = $this->processWithMimeType($inputPath, $resolvedMimeType, $outputPath, $sanitizeAlways);
+            $sanitizedData = $this->readFileIfExists($result['sanitize']->outputPath);
+            return [...$result, 'sanitizedData' => $sanitizedData];
+        } finally {
+            $this->safeUnlink($inputPath);
+            if (!is_string($outputPath) && isset($result['sanitize']) && is_file($result['sanitize']->outputPath)) {
+                $this->safeUnlink($result['sanitize']->outputPath);
+                $this->cleanupEmptyDir(dirname($result['sanitize']->outputPath));
+            }
+            $this->cleanupEmptyDir(dirname($inputPath));
+        }
+    }
+
+    /**
+     * Decodes a base64 payload (bare or as a data:...;base64,... URI), sanitizes it and adds the result re-encoded as base64.
+     *
+     * @param string|null $mimeType MIME type to use; when null, the data-URI media type is used if one is present.
+     *
+     * @return array{mimeType:string, scan:ScanReport, sanitize:SanitizeReport, sanitizedData:string, sanitizedBase64:string}
+     * @throws RuntimeException When the payload is not valid base64.
+     */
+    public function processBase64(string $base64Data, ?string $filenameHint = null, bool|string|null $outputPath = null, bool $sanitizeAlways = false, ?string $mimeType = null): array
+    {
+        if (is_bool($outputPath)) {
+            $sanitizeAlways = $outputPath;
+            $outputPath = null;
+        }
+        $decoded = base64_decode($this->extractBase64Payload($base64Data), true);
+        if ($decoded === false) {
+            throw new RuntimeException('Invalid base64 input.');
+        }
+        $result = $this->processBinary($decoded, $filenameHint, $outputPath, $sanitizeAlways, $mimeType ?? $this->extractDataUriMimeType($base64Data));
+        return [...$result, 'sanitizedBase64' => base64_encode($result['sanitizedData'])];
+    }
+
+    /**
+     * Returns the first registered sanitizer that supports the MIME type and path, or null when none does.
+     */
+    private function resolveSanitizer(string $mimeType, string $path): ?SanitizerInterface
+    {
+        foreach ($this->sanitizers as $sanitizer) {
+            if ($sanitizer->supports($mimeType, $path)) {
+                return $sanitizer;
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Builds the default output path by inserting ".sanitized" before the file extension.
+     *
+     * An extension-less path gets ".sanitized" appended: substr($path, 0, -0) would return '' and drop the whole path.
+     */
+    private function defaultOutputPath(string $inputPath): string
+    {
+        $extension = pathinfo($inputPath, PATHINFO_EXTENSION);
+        if ($extension === '') {
+            return $inputPath . '.sanitized';
+        }
+        return substr($inputPath, 0, -(strlen($extension) + 1)) . '.sanitized.' . $extension;
+    }
+
+    /**
+     * @return array{mimeType:string, scan:ScanReport, sanitize:SanitizeReport}
+     */
+    private function processWithMimeType(string $inputPath, string $mimeType, bool|string|null $outputPath = null, bool $sanitizeAlways = false): array
+    {
         $scan = ($this->scanner ?? new PatternScanner())->scan($inputPath, $mimeType);
         $outputPath ??= $this->defaultOutputPath($inputPath);
         if (!$scan->safe && !$sanitizeAlways) {
@@ -65,24 +199,141 @@ public function process(string $inputPath, bool|string|null $outputPath = null,
         return ['mimeType' => $mimeType, 'scan' => $scan, 'sanitize' => $sanitize];
     }
 
-    public function sanitizeAlways(string $inputPath, ?string $outputPath = null): array
+    /**
+     * Creates a fresh temporary directory and returns a file path inside it for the payload.
+     *
+     * The file name is the sanitized basename of $filenameHint, or "upload.<ext>" chosen from $mimeType.
+     *
+     * @throws RuntimeException When no random name can be generated or the directory cannot be created.
+     */
+    private function createTempInputPath(?string $filenameHint = null, ?string $mimeType = null): string
     {
-        return $this->process($inputPath, $outputPath, true);
+        $fallbackExtension = match (strtolower($mimeType ?? '')) {
+            'text/html' => 'html',
+            'application/xhtml+xml' => 'xhtml',
+            'image/svg+xml' => 'svg',
+            'application/json' => 'json',
+            'application/xml', 'text/xml' => 'xml',
+            'text/plain' => 'txt',
+            'application/pdf' => 'pdf',
+            'image/jpeg' => 'jpg',
+            'image/png' => 'png',
+            'image/gif' => 'gif',
+            'image/webp' => 'webp',
+            'application/zip' => 'zip',
+            default => null,
+        };
+        $hint = $filenameHint ?? ('upload' . ($fallbackExtension !== null ? '.' . $fallbackExtension : '.bin'));
+        $safeName = trim(preg_replace('/[^A-Za-z0-9._-]/', '_', basename($hint)) ?? 'upload.bin');
+        if ($safeName === '' || $safeName === '.' || $safeName === '..') {
+            $safeName = 'upload' . ($fallbackExtension !== null ? '.' . $fallbackExtension : '.bin');
+        }
+        try {
+            $tempDirectory = sys_get_temp_dir() . DIRECTORY_SEPARATOR . 'fsz_data_' . bin2hex(random_bytes(8));
+        } catch (Exception $e) {
+            throw new RuntimeException('Could not generate random directory name for string processing.', previous: $e);
+        }
+        // 0700: the directory holds untrusted, not-yet-sanitized payloads; keep other local users out.
+        if (!mkdir($tempDirectory, 0700, true) && !is_dir($tempDirectory)) {
+            throw new RuntimeException('Could not create temporary directory for string processing.');
+        }
+        return $tempDirectory . DIRECTORY_SEPARATOR . $safeName;
     }
 
-    private function resolveSanitizer(string $mimeType, string $path): ?SanitizerInterface
+    /**
+     * Strips an optional data-URI header and all whitespace, leaving the bare base64 text.
+     *
+     * The header may carry parameters (e.g. ";charset=utf-8") or omit the media type, as RFC 2397 allows.
+     */
+    private function extractBase64Payload(string $base64Data): string
     {
-        foreach ($this->sanitizers as $sanitizer) {
-            if ($sanitizer->supports($mimeType, $path)) {
-                return $sanitizer;
+        $trimmed = trim($base64Data);
+        if (preg_match('#^data:[^,]*;base64,(.+)$#is', $trimmed, $matches) === 1) {
+            $trimmed = $matches[1];
+        }
+        return preg_replace('/\s+/', '', $trimmed) ?? $trimmed;
+    }
+
+    /**
+     * Returns the lower-cased media type of a base64 data URI, or null when the input has none.
+     */
+    private function extractDataUriMimeType(string $base64Data): ?string
+    {
+        // Optional parameters such as ";charset=utf-8" may sit between the media type and ";base64".
+        if (preg_match('#^\s*data:([^;,\s]+)(?:;[^;,]*)*;base64,#i', $base64Data, $matches) !== 1) {
+            return null;
+        }
+        $mimeType = strtolower(trim($matches[1]));
+        return $mimeType !== '' ? $mimeType : null;
+    }
+
+    /**
+     * Detects the MIME type of an in-memory payload: fileinfo first, then a few prefix checks as fallback.
+     */
+    private function detectMimeTypeFromData(string $data): string
+    {
+        $finfo = finfo_open(FILEINFO_MIME_TYPE);
+        if ($finfo !== false) {
+            $detected = finfo_buffer($finfo, $data);
+            finfo_close($finfo);
+            if (is_string($detected) && $detected !== '' && $detected !== 'application/octet-stream') {
+                return strtolower(trim($detected));
             }
         }
-        return null;
+        $trimmed = ltrim($data);
+        // NOTE(review): the '<svg' and '<html' patterns were unreadable in the reviewed copy of this patch and are reconstructed - confirm against the repository.
+        return match (true) {
+            preg_match('/^%PDF-/i', $trimmed) === 1 => 'application/pdf',
+            preg_match('/^<svg\b/i', $trimmed) === 1 => 'image/svg+xml',
+            preg_match('/^<\?xml\b/i', $trimmed) === 1 => 'application/xml',
+            preg_match('/^<(?:!doctype\s+html|html)\b/i', $trimmed) === 1 => 'text/html',
+            preg_match('/^\s*[{\[]/', $trimmed) === 1 => 'application/json',
+            default => 'application/octet-stream',
+        };
     }
 
-    private function defaultOutputPath(string $inputPath): string
+    /**
+     * Unescapes input written as a Python-style bytes literal (b"..." or b'...'); any other input is returned unchanged.
+     */
+    private function normalizeStringInput(string $data): string
     {
-        $extension = pathinfo($inputPath, PATHINFO_EXTENSION);
-        return substr($inputPath, 0, -strlen($extension) - ($extension !== '' ? 1 : 0)) . '.sanitized' . ($extension !== '' ? '.' . $extension : '');
+        if (preg_match('/^b([\"\'])(.*)\1$/is', trim($data), $matches) !== 1) {
+            return $data;
+        }
+        return stripcslashes($matches[2]);
+    }
+
+    /**
+     * Returns the file's contents, or '' when the path is not a regular file.
+     *
+     * @throws RuntimeException When the file exists but cannot be read.
+     */
+    private function readFileIfExists(string $path): string
+    {
+        if (!is_file($path)) {
+            return '';
+        }
+        $content = file_get_contents($path);
+        return $content !== false ? $content : throw new RuntimeException(sprintf('Could not read output file: %s', $path));
+    }
+
+    /**
+     * Best-effort removal of a regular file; a failed unlink is deliberately ignored.
+     */
+    private function safeUnlink(string $path): void
+    {
+        if (is_file($path)) {
+            @unlink($path);
+        }
+    }
+
+    /**
+     * Best-effort removal of a directory whose listing contains only "." and "..".
+     */
+    private function cleanupEmptyDir(string $directory): void
+    {
+        if (is_dir($directory) && count(scandir($directory) ?: []) === 2) {
+            @rmdir($directory);
+        }
     }
 }
diff --git a/tests/Sanitizer/FileSanitizerInputStringTest.php b/tests/Sanitizer/FileSanitizerInputStringTest.php
new file mode 100644
index 0000000..3dc28dc
--- /dev/null
+++ b/tests/Sanitizer/FileSanitizerInputStringTest.php
@@ -0,0 +1,77 @@
+processString('
ok
', 'upload.html', true); + + self::assertArrayHasKey('sanitizedData', $result); + self::assertStringNotContainsString(''); + + $result = $sanitizer->processBase64($payload, 'upload.svg', true); + + self::assertArrayHasKey('sanitizedData', $result); + self::assertArrayHasKey('sanitizedBase64', $result); + self::assertStringNotContainsString('processString($payload, null, true); + + self::assertSame('text/html', $result['mimeType']); + self::assertStringNotContainsString('processString($blobLiteral, null, true); + + self::assertSame('text/html', $result['mimeType']); + self::assertStringNotContainsString('processBinary($pngBytes, null, true, true, 'image/png'); + + self::assertSame('image/png', $result['mimeType']); + self::assertNotSame('', $result['sanitizedData']); + self::assertSame("\x89PNG\r\n\x1A\n", substr($result['sanitizedData'], 0, 8)); + } +}