Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,57 @@ if (!$result['scan']->safe) {
echo 'Sanitized file written to: ' . $result['sanitize']->outputPath . PHP_EOL;
```

## String and base64 input

You can sanitize raw payloads directly without creating input files yourself.

```php
<?php

use SytxLabs\FileSanitizer\FileSanitizer;

$sanitizer = new FileSanitizer();

$html = '<div onclick="x()"><script>alert(1)</script>ok</div>';
$result = $sanitizer->processString($html, 'upload.html', true);

echo $result['sanitizedData'];
```

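`processString()` also accepts an optional `filenameHint` as the 2nd argument and an optional `mimeType` as the 5th argument. The 3rd argument is `outputPath`; passing a boolean there (as in the example above) is shorthand for `sanitizeAlways`.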
If `mimeType` is `null`, FileSanitizer detects it from the payload data.
It also supports Python-style bytes-literal input such as `b"...\xFF..."` or `b'...\x00...'`.

For raw binary bytes, use `processBinary()`:

```php
<?php

use SytxLabs\FileSanitizer\FileSanitizer;

$sanitizer = new FileSanitizer();
$bytes = file_get_contents('php://input');
$result = $sanitizer->processBinary($bytes, null, null, true, null);
```

For base64 payloads (including `data:*;base64,...` input):

```php
<?php

use SytxLabs\FileSanitizer\FileSanitizer;

$sanitizer = new FileSanitizer();

$payload = 'data:image/svg+xml;base64,' . base64_encode('<svg><script>alert(1)</script></svg>');
$result = $sanitizer->processBase64($payload, 'upload.svg', true);

echo $result['sanitizedBase64'];
```

`processBase64()` also accepts an optional `filenameHint` as the 2nd argument and an optional `mimeType` as the 5th argument.
If `mimeType` is `null`, the MIME type from the data URI is used when present; otherwise it is detected from the decoded data.

## sanitizeAlways mode

When `sanitizeAlways` is enabled, FileSanitizer will attempt best-effort sanitization even if risky content is detected during scanning.
Expand Down
200 changes: 190 additions & 10 deletions src/FileSanitizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

namespace SytxLabs\FileSanitizer;

use Exception;
use RuntimeException;
use SytxLabs\FileSanitizer\Contracts\SanitizerInterface;
use SytxLabs\FileSanitizer\Contracts\ScannerInterface;
Expand Down Expand Up @@ -44,6 +45,103 @@ public function process(string $inputPath, bool|string|null $outputPath = null,
}

$mimeType = ($this->mimeDetector ?? new MimeDetector())->detect($inputPath);

return $this->processWithMimeType($inputPath, $mimeType, $outputPath, $sanitizeAlways);
}

/**
 * Convenience wrapper that runs process() with `true` as its third argument.
 *
 * @param string      $inputPath  Path of the file to process.
 * @param string|null $outputPath Destination path forwarded to process() unchanged.
 *
 * @return array{mimeType:string, scan:ScanReport, sanitize:SanitizeReport}
 * @noinspection PhpUnused
 */
public function sanitizeAlways(string $inputPath, ?string $outputPath = null): array
{
    // Presumably the third argument of process() is its sanitize-always flag; its signature is only partly visible here.
    $forceSanitization = true;

    return $this->process($inputPath, $outputPath, $forceSanitization);
}

/**
 * Scans and sanitizes an in-memory string payload.
 *
 * Delegates to processDataInput() with string-literal normalization left at its
 * default (enabled), so input shaped like b"..." / b'...' is unescaped first.
 *
 * @param string           $data           Payload to process.
 * @param string|null      $filenameHint   Optional file name used for the temporary input file.
 * @param bool|string|null $outputPath     Destination path, or a boolean acting as shorthand for $sanitizeAlways.
 * @param bool             $sanitizeAlways Forwarded to the processing pipeline.
 * @param string|null      $mimeType       Explicit MIME type; null means it is detected from the data.
 *
 * @return array{mimeType:string, scan:ScanReport, sanitize:SanitizeReport, sanitizedData:string}
 */
public function processString(string $data, ?string $filenameHint = null, bool|string|null $outputPath = null, bool $sanitizeAlways = false, ?string $mimeType = null): array
{
    $report = $this->processDataInput($data, $filenameHint, $outputPath, $sanitizeAlways, $mimeType);

    return $report;
}

/**
 * Scans and sanitizes raw binary bytes held in memory.
 *
 * Same as processString() except that the bytes are used verbatim: the
 * b"..." literal normalization of processDataInput() is switched off.
 *
 * @param string           $binaryData     Raw bytes to process.
 * @param string|null      $filenameHint   Optional file name used for the temporary input file.
 * @param bool|string|null $outputPath     Destination path, or a boolean acting as shorthand for $sanitizeAlways.
 * @param bool             $sanitizeAlways Forwarded to the processing pipeline.
 * @param string|null      $mimeType       Explicit MIME type; null means it is detected from the data.
 *
 * @return array{mimeType:string, scan:ScanReport, sanitize:SanitizeReport, sanitizedData:string}
 */
public function processBinary(string $binaryData, ?string $filenameHint = null, bool|string|null $outputPath = null, bool $sanitizeAlways = false, ?string $mimeType = null): array
{
    return $this->processDataInput(
        $binaryData,
        $filenameHint,
        $outputPath,
        $sanitizeAlways,
        $mimeType,
        normalizeStringLiterals: false,
    );
}

/**
 * Writes an in-memory payload to a temporary file, runs it through
 * processWithMimeType() and returns the report plus the sanitized bytes.
 *
 * The temporary input file and its directory are always removed afterwards.
 * The sanitized output file is removed as well unless the caller supplied an
 * explicit string $outputPath.
 *
 * @param string           $data                    Payload to process.
 * @param string|null      $filenameHint            Optional file name for the temporary input file.
 * @param bool|string|null $outputPath              Destination path, or a boolean acting as shorthand for $sanitizeAlways.
 * @param bool             $sanitizeAlways          Forwarded to processWithMimeType().
 * @param string|null      $mimeType                Explicit MIME type; null means it is detected from the data.
 * @param bool             $normalizeStringLiterals Whether to run normalizeStringInput() on $data first.
 *
 * @return array{mimeType:string, scan:ScanReport, sanitize:SanitizeReport, sanitizedData:string}
 *
 * @throws RuntimeException When the temporary input file cannot be written.
 */
private function processDataInput(string $data, ?string $filenameHint = null, bool|string|null $outputPath = null, bool $sanitizeAlways = false, ?string $mimeType = null, bool $normalizeStringLiterals = true): array
{
    // A boolean in the $outputPath slot is shorthand for $sanitizeAlways.
    if (is_bool($outputPath)) {
        $sanitizeAlways = $outputPath;
        $outputPath = null;
    }
    $normalizedData = $normalizeStringLiterals ? $this->normalizeStringInput($data) : $data;
    $resolvedMimeType = $mimeType ?? $this->detectMimeTypeFromData($normalizedData);
    $inputPath = $this->createTempInputPath($filenameHint, $resolvedMimeType);
    $result = null;
    try {
        // The write lives inside the try block so that a failed write still runs
        // the cleanup below and does not leave the temporary directory behind.
        if (file_put_contents($inputPath, $normalizedData) === false) {
            throw new RuntimeException('Could not write temporary input file for string processing.');
        }
        $result = $this->processWithMimeType($inputPath, $resolvedMimeType, $outputPath, $sanitizeAlways);
        $sanitizedData = $this->readFileIfExists($result['sanitize']->outputPath);
        return [...$result, 'sanitizedData' => $sanitizedData];
    } finally {
        $this->safeUnlink($inputPath);
        // Only delete the output file when the caller did not ask for it at a specific path.
        if (!is_string($outputPath) && isset($result['sanitize']) && is_file($result['sanitize']->outputPath)) {
            $this->safeUnlink($result['sanitize']->outputPath);
            $this->cleanupEmptyDir(dirname($result['sanitize']->outputPath));
        }
        $this->cleanupEmptyDir(dirname($inputPath));
    }
}

/**
 * Decodes a base64 payload (optionally wrapped in a data URI), processes the
 * decoded bytes via processBinary() and adds a base64 copy of the result.
 *
 * @param string           $base64Data     Base64 text or a `data:...;base64,...` URI.
 * @param string|null      $filenameHint   Optional file name used for the temporary input file.
 * @param bool|string|null $outputPath     Destination path, or a boolean acting as shorthand for $sanitizeAlways.
 * @param bool             $sanitizeAlways Forwarded to processBinary().
 * @param string|null      $mimeType       Explicit MIME type; when null the data-URI MIME type is tried first.
 *
 * @return array{mimeType:string, scan:ScanReport, sanitize:SanitizeReport, sanitizedData:string, sanitizedBase64:string}
 *
 * @throws RuntimeException When the payload is not valid base64.
 */
public function processBase64(string $base64Data, ?string $filenameHint = null, bool|string|null $outputPath = null, bool $sanitizeAlways = false, ?string $mimeType = null): array
{
    // A boolean in the $outputPath slot is shorthand for $sanitizeAlways.
    if (is_bool($outputPath)) {
        [$sanitizeAlways, $outputPath] = [$outputPath, null];
    }

    $payload = $this->extractBase64Payload($base64Data);
    $binary = base64_decode($payload, true);
    if ($binary === false) {
        throw new RuntimeException('Invalid base64 input.');
    }

    $effectiveMimeType = $mimeType ?? $this->extractDataUriMimeType($base64Data);
    $report = $this->processBinary($binary, $filenameHint, $outputPath, $sanitizeAlways, $effectiveMimeType);
    $report['sanitizedBase64'] = base64_encode($report['sanitizedData']);

    return $report;
}

/**
 * Returns the first registered sanitizer whose supports() accepts the given
 * MIME type and path, or null when none does.
 */
private function resolveSanitizer(string $mimeType, string $path): ?SanitizerInterface
{
    foreach ($this->sanitizers as $candidate) {
        if (!$candidate->supports($mimeType, $path)) {
            continue;
        }

        return $candidate;
    }

    return null;
}

/**
 * Derives the default output path by inserting ".sanitized" before the file
 * extension, e.g. "/tmp/a.html" becomes "/tmp/a.sanitized.html".
 *
 * Paths without an extension simply get ".sanitized" appended.
 *
 * @param string $inputPath Path of the input file.
 *
 * @return string Path to use for the sanitized output.
 */
private function defaultOutputPath(string $inputPath): string
{
    $extension = pathinfo($inputPath, PATHINFO_EXTENSION);
    if ($extension === '') {
        // substr($inputPath, 0, -0) is substr($inputPath, 0, 0) and would return '',
        // discarding the whole path, so the no-extension case is handled separately.
        return $inputPath . '.sanitized';
    }

    // Strip the extension and the dot in front of it.
    $stem = substr($inputPath, 0, -(strlen($extension) + 1));

    return $stem . '.sanitized.' . $extension;
}

/**
* @return array{mimeType:string, scan:ScanReport, sanitize:SanitizeReport}
*/
private function processWithMimeType(string $inputPath, string $mimeType, bool|string|null $outputPath = null, bool $sanitizeAlways = false): array
{
$scan = ($this->scanner ?? new PatternScanner())->scan($inputPath, $mimeType);
$outputPath ??= $this->defaultOutputPath($inputPath);
if (!$scan->safe && !$sanitizeAlways) {
Expand All @@ -65,24 +163,106 @@ public function process(string $inputPath, bool|string|null $outputPath = null,
return ['mimeType' => $mimeType, 'scan' => $scan, 'sanitize' => $sanitize];
}

/**
 * Creates a fresh, randomly named directory under the system temp directory
 * and returns the path of a (not yet created) input file inside it.
 *
 * The file name is the sanitized basename of $filenameHint, or
 * "upload.<ext>" with an extension chosen from $mimeType ("bin" when unknown).
 *
 * @param string|null $filenameHint Optional caller-supplied file name; only its basename is used.
 * @param string|null $mimeType     MIME type used to pick a fallback extension.
 *
 * @return string Path of the temporary input file.
 *
 * @throws RuntimeException When no random name can be generated or the directory cannot be created.
 */
private function createTempInputPath(?string $filenameHint = null, ?string $mimeType = null): string
{
    $fallbackExtension = match (strtolower($mimeType ?? '')) {
        'text/html' => 'html',
        'application/xhtml+xml' => 'xhtml',
        'image/svg+xml' => 'svg',
        'application/json' => 'json',
        'application/xml', 'text/xml' => 'xml',
        'text/plain' => 'txt',
        'application/pdf' => 'pdf',
        'image/jpeg' => 'jpg',
        'image/png' => 'png',
        'image/gif' => 'gif',
        'image/webp' => 'webp',
        'application/zip' => 'zip',
        default => 'bin',
    };
    $defaultName = 'upload.' . $fallbackExtension;
    $hint = $filenameHint ?? $defaultName;
    // basename() drops any directory part; every remaining character outside the allow-list becomes "_".
    $safeName = trim(preg_replace('/[^A-Za-z0-9._-]/', '_', basename($hint)) ?? 'upload.bin');
    if ($safeName === '' || $safeName === '.' || $safeName === '..') {
        $safeName = $defaultName;
    }
    try {
        $tempDirectory = sys_get_temp_dir() . DIRECTORY_SEPARATOR . 'fsz_data_' . bin2hex(random_bytes(8));
    } catch (Exception $e) {
        throw new RuntimeException('Could not generate random directory name for string processing.', previous: $e);
    }
    // 0700: the directory holds untrusted payloads awaiting sanitization, so other local users get no access.
    if (!mkdir($tempDirectory, 0700, true) && !is_dir($tempDirectory)) {
        throw new RuntimeException('Could not create temporary directory for string processing.');
    }
    return $tempDirectory . DIRECTORY_SEPARATOR . $safeName;
}

/**
 * Returns the bare base64 text of the input: an optional
 * `data:<media type>[;params];base64,` prefix is removed and all whitespace
 * is stripped.
 *
 * @param string $base64Data Base64 text or a base64 data URI.
 *
 * @return string Base64 text without prefix or whitespace (not validated here).
 */
private function extractBase64Payload(string $base64Data): string
{
    $trimmed = trim($base64Data);
    // [^,]* spans the media type and any ";name=value" parameters (e.g. ";charset=utf-8"),
    // and also allows the media type to be omitted entirely ("data:;base64,...").
    if (preg_match('#^data:[^,]*;base64,(.+)$#is', $trimmed, $matches) === 1) {
        $trimmed = $matches[1];
    }
    return preg_replace('/\s+/', '', $trimmed) ?? $trimmed;
}

/**
 * Extracts the lower-cased media type from a base64 data URI.
 *
 * Media-type parameters between the type and ";base64" (such as
 * ";charset=utf-8") are accepted and ignored.
 *
 * @param string $base64Data Possible data URI.
 *
 * @return string|null The media type, or null when the input is not a base64 data URI with a media type.
 */
private function extractDataUriMimeType(string $base64Data): ?string
{
    if (preg_match('#^\s*data:([^;,\s]+)(?:;[^;,]*)*;base64,#i', $base64Data, $matches) !== 1) {
        return null;
    }
    $mimeType = strtolower(trim($matches[1]));
    return $mimeType !== '' ? $mimeType : null;
}

/**
 * Detects the MIME type of an in-memory payload.
 *
 * fileinfo is asked first. When it is unavailable or only reports
 * "application/octet-stream", a few prefix heuristics on the
 * left-trimmed data decide; "application/octet-stream" is the final fallback.
 *
 * @param string $data Payload to inspect.
 *
 * @return string Lower-cased MIME type.
 */
private function detectMimeTypeFromData(string $data): string
{
    $finfo = finfo_open(FILEINFO_MIME_TYPE);
    if ($finfo !== false) {
        $detected = finfo_buffer($finfo, $data);
        finfo_close($finfo);
        if (is_string($detected) && $detected !== '' && $detected !== 'application/octet-stream') {
            return strtolower(trim($detected));
        }
    }
    $trimmed = ltrim($data);
    $hasXmlProlog = preg_match('/^<\?xml\b/i', $trimmed) === 1;
    // SVG files commonly start with an XML prolog. Look past the prolog and any leading
    // comments so the root element decides, instead of reporting such files as generic XML.
    // Only the head of the payload is examined to keep the regex work bounded.
    $head = substr($trimmed, 0, 4096);
    $afterProlog = $hasXmlProlog
        ? (preg_replace('/^<\?xml\b.*?\?>\s*(?:<!--.*?-->\s*)*/is', '', $head, 1) ?? $head)
        : $head;
    return match (true) {
        preg_match('/^%PDF-/i', $trimmed) === 1 => 'application/pdf',
        preg_match('/^<!DOCTYPE\s+svg\b/i', $afterProlog) === 1 || preg_match('/^<svg\b/i', $afterProlog) === 1 => 'image/svg+xml',
        $hasXmlProlog => 'application/xml',
        preg_match('/^<html\b/i', $trimmed) === 1 || preg_match('/^<!doctype\s+html\b/i', $trimmed) === 1 => 'text/html',
        preg_match('/^\s*[{\[]/', $trimmed) === 1 => 'application/json',
        default => 'application/octet-stream',
    };
}

/**
 * Unwraps Python-style bytes literals.
 *
 * When the trimmed input has the form b"..." or b'...', the quoted part is
 * returned with C-style escape sequences decoded by stripcslashes(). Any
 * other input is returned unchanged (and untrimmed).
 */
private function normalizeStringInput(string $data): string
{
    $matches = [];
    $isBytesLiteral = preg_match('/^b([\"\'])(.*)\1$/is', trim($data), $matches) === 1;

    return $isBytesLiteral ? stripcslashes($matches[2]) : $data;
}

/**
 * Returns the contents of $path, or an empty string when $path is not a
 * regular file.
 *
 * @throws RuntimeException When the file exists but cannot be read.
 */
private function readFileIfExists(string $path): string
{
    $contents = is_file($path) ? file_get_contents($path) : '';
    if ($contents === false) {
        throw new RuntimeException(sprintf('Could not read output file: %s', $path));
    }

    return $contents;
}

/**
 * Best-effort removal of a regular file; anything that is not a regular file
 * is left alone and a failing unlink() is deliberately silenced.
 */
private function safeUnlink(string $path): void
{
    if (!is_file($path)) {
        return;
    }

    @unlink($path);
}

/**
 * Best-effort removal of a directory whose listing has exactly two entries
 * (on a regular filesystem: "." and "..", i.e. the directory is empty).
 */
private function cleanupEmptyDir(string $directory): void
{
    if (!is_dir($directory)) {
        return;
    }

    $entries = scandir($directory) ?: [];
    if (count($entries) !== 2) {
        return;
    }

    @rmdir($directory);
}
}
77 changes: 77 additions & 0 deletions tests/Sanitizer/FileSanitizerInputStringTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
<?php

namespace SytxLabs\FileSanitizer\Tests\Sanitizer;

use PHPUnit\Framework\TestCase;
use SytxLabs\FileSanitizer\FileSanitizer;

/**
 * Covers the in-memory entry points of FileSanitizer: processString(),
 * processBase64() and processBinary().
 */
final class FileSanitizerInputStringTest extends TestCase
{
    /**
     * An HTML string with an explicit .html file name hint loses its script
     * tag and inline handler while keeping the text content.
     */
    public function testProcessStringSanitizesHtmlPayloadAndReturnsSanitizedData(): void
    {
        $sanitizer = new FileSanitizer();
        // The boolean third argument is shorthand for sanitizeAlways.
        $result = $sanitizer->processString('<div onclick="x()"><script>alert(1)</script>ok</div>', 'upload.html', true);

        self::assertArrayHasKey('sanitizedData', $result);
        self::assertStringNotContainsString('<script', strtolower($result['sanitizedData']));
        self::assertStringNotContainsString('onclick=', strtolower($result['sanitizedData']));
        self::assertStringContainsString('ok', $result['sanitizedData']);
    }

    /**
     * A base64 SVG data URI is sanitized, and sanitizedBase64 is exactly the
     * base64 encoding of sanitizedData.
     */
    public function testProcessBase64SanitizesDataUriAndReturnsSanitizedBase64(): void
    {
        $sanitizer = new FileSanitizer();
        $payload = 'data:image/svg+xml;base64,' . base64_encode('<svg xmlns="http://www.w3.org/2000/svg"><script>alert(1)</script><circle cx="5" cy="5" r="5"/></svg>');

        $result = $sanitizer->processBase64($payload, 'upload.svg', true);

        self::assertArrayHasKey('sanitizedData', $result);
        self::assertArrayHasKey('sanitizedBase64', $result);
        self::assertStringNotContainsString('<script', strtolower($result['sanitizedData']));
        self::assertSame($result['sanitizedData'], base64_decode($result['sanitizedBase64'], true));
    }

    /**
     * Without a file name hint or explicit MIME type, the type is detected
     * from the payload itself.
     */
    public function testProcessStringDetectsMimeTypeFromDataWhenHintIsNull(): void
    {
        $sanitizer = new FileSanitizer();
        $payload = '<!doctype html><div onclick="x()"><script>alert(1)</script>safe</div>';

        $result = $sanitizer->processString($payload, null, true);

        self::assertSame('text/html', $result['mimeType']);
        self::assertStringNotContainsString('<script', strtolower($result['sanitizedData']));
        self::assertStringNotContainsString('onclick=', strtolower($result['sanitizedData']));
        self::assertStringContainsString('safe', $result['sanitizedData']);
    }

    /**
     * A Python-style b"..." literal is unwrapped and unescaped before
     * detection and sanitization.
     */
    public function testProcessStringSupportsPythonStyleBytesLiteralBlob(): void
    {
        $sanitizer = new FileSanitizer();
        $blobLiteral = 'b"<!doctype html><div onclick=\\"x()\\"><script>alert(1)</script>blob</div>"';

        $result = $sanitizer->processString($blobLiteral, null, true);

        self::assertSame('text/html', $result['mimeType']);
        self::assertStringNotContainsString('<script', strtolower($result['sanitizedData']));
        self::assertStringNotContainsString('onclick=', strtolower($result['sanitizedData']));
        self::assertStringContainsString('blob', $result['sanitizedData']);
    }

    /**
     * Raw PNG bytes passed with an explicit MIME type come back as a PNG.
     */
    public function testProcessBinarySupportsRawImageBytes(): void
    {
        // The fixture is generated with GD; skip cleanly instead of failing with a fatal error when it is missing.
        if (!function_exists('imagecreatetruecolor') || !function_exists('imagepng')) {
            self::markTestSkipped('The GD extension is required to build the PNG fixture.');
        }

        $sanitizer = new FileSanitizer();
        $image = imagecreatetruecolor(1, 1);
        self::assertNotFalse($image);
        // imagepng() writes to the output stream, so capture it in a buffer.
        // No imagedestroy(): it has been a no-op since PHP 8.0 and is deprecated in 8.5.
        ob_start();
        imagepng($image);
        $pngBytes = ob_get_clean();
        self::assertIsString($pngBytes);

        $result = $sanitizer->processBinary($pngBytes, null, true, true, 'image/png');

        self::assertSame('image/png', $result['mimeType']);
        self::assertNotSame('', $result['sanitizedData']);
        // First eight bytes of every PNG file.
        self::assertSame("\x89PNG\r\n\x1A\n", substr($result['sanitizedData'], 0, 8));
    }
}