Page Content: Added more complex & configurable content filtering

- Added new option to control parts of the filter.
- Added whitelist filtering pass via HTMLPurifier.
This commit is contained in:
Dan Brown
2026-02-13 14:14:28 +00:00
parent 46dcc30bf7
commit 10ebe53bd9
10 changed files with 294 additions and 65 deletions

View File

@@ -8,6 +8,7 @@ use BookStack\Permissions\PermissionApplicator;
use BookStack\Users\Models\HasCreatorAndUpdater;
use BookStack\Users\Models\OwnableInterface;
use BookStack\Util\HtmlContentFilter;
use BookStack\Util\HtmlContentFilterConfig;
use Illuminate\Database\Eloquent\Builder;
use Illuminate\Database\Eloquent\Factories\HasFactory;
use Illuminate\Database\Eloquent\Relations\BelongsTo;
@@ -82,7 +83,8 @@ class Comment extends Model implements Loggable, OwnableInterface
public function safeHtml(): string
{
return HtmlContentFilter::removeActiveContentFromHtmlString($this->html ?? '');
$filter = new HtmlContentFilter(new HtmlContentFilterConfig());
return $filter->filterString($this->html ?? '');
}
public function jointPermissions(): HasMany

View File

@@ -42,6 +42,18 @@ return [
// Even when overridden the WYSIWYG editor may still escape script content.
'allow_content_scripts' => env('ALLOW_CONTENT_SCRIPTS', false),
// Control the behaviour of page content filtering.
// This setting is a collection of characters which represent different available filters:
// - j - Filter out JavaScript based content
// - h - Filter out unexpected, potentially dangerous, HTML elements
// - f - Filter out unexpected form elements
// - a - Run content through a more complex allow-list filter
// This defaults to using all filters, unless ALLOW_CONTENT_SCRIPTS is set to true, in which case no filters are used.
// Note: These filters are a best-effort measure and may not be 100% effective. They are typically used as one layer in addition to other security measures.
// TODO - Add to example env
// TODO - Remove allow_content_scripts option above
'content_filtering' => env('CONTENT_FILTERING', env('ALLOW_CONTENT_SCRIPTS', false) === true ? '' : 'jfha'),
// Allow server-side fetches to be performed to potentially unknown
// and user-provided locations. Primarily used in exports when loading
// in externally referenced assets.

View File

@@ -6,6 +6,7 @@ use BookStack\Entities\Models\Book;
use BookStack\Entities\Models\Bookshelf;
use BookStack\Entities\Models\Chapter;
use BookStack\Util\HtmlContentFilter;
use BookStack\Util\HtmlContentFilterConfig;
class EntityHtmlDescription
{
@@ -50,7 +51,8 @@ class EntityHtmlDescription
return $html;
}
return HtmlContentFilter::removeActiveContentFromHtmlString($html);
$filter = new HtmlContentFilter(new HtmlContentFilterConfig());
return $filter->filterString($html);
}
public function getPlain(): string

View File

@@ -13,6 +13,7 @@ use BookStack\Uploads\ImageRepo;
use BookStack\Uploads\ImageService;
use BookStack\Users\Models\User;
use BookStack\Util\HtmlContentFilter;
use BookStack\Util\HtmlContentFilterConfig;
use BookStack\Util\HtmlDocument;
use BookStack\Util\WebSafeMimeSniffer;
use Closure;
@@ -317,11 +318,28 @@ class PageContent
$this->updateIdsRecursively($doc->getBody(), 0, $idMap, $changeMap);
}
if (!config('app.allow_content_scripts')) {
HtmlContentFilter::removeActiveContentFromDocument($doc);
$cacheKey = $this->getContentCacheKey($doc->getBodyInnerHtml());
$cached = cache()->get($cacheKey, null);
if ($cached !== null) {
return $cached;
}
return $doc->getBodyInnerHtml();
$filterConfig = HtmlContentFilterConfig::fromConfigString(config('app.content_filtering'));
$filter = new HtmlContentFilter($filterConfig);
$filtered = $filter->filterDocument($doc);
$cacheTime = 86400 * 7; // 1 week
cache()->put($cacheKey, $filtered, $cacheTime);
return $filtered;
}
/**
 * Build the cache key under which the filtered render of the given HTML is stored.
 *
 * The key is made up of the active content filtering configuration, the page id,
 * the page's last-updated timestamp and an MD5 hash of the unfiltered HTML.
 *
 * @param string $html The unfiltered body HTML that is about to be filtered.
 */
protected function getContentCacheKey(string $html): string
{
    $contentHash = md5($html);
    $contentId = $this->page->id;

    // Guard against a page without an update timestamp instead of
    // dereferencing null. NOTE(review): presumably only relevant for
    // not-yet-saved pages; confirm against callers.
    $contentTime = $this->page->updated_at?->timestamp ?? 0;

    // The filtered output depends on which filters are enabled, so the filter
    // configuration has to be part of the key. Without it, changing the
    // 'app.content_filtering' option would keep serving content that was
    // filtered under the previous rules until the cache entry expires.
    $filterConfig = (string) config('app.content_filtering');

    return "page-content-cache::{$filterConfig}::{$contentId}::{$contentTime}::{$contentHash}";
}
/**

View File

@@ -4,25 +4,16 @@ namespace BookStack\Theming;
use BookStack\Util\CspService;
use BookStack\Util\HtmlContentFilter;
use BookStack\Util\HtmlContentFilterConfig;
use BookStack\Util\HtmlNonceApplicator;
use Illuminate\Contracts\Cache\Repository as Cache;
class CustomHtmlHeadContentProvider
{
/**
* @var CspService
*/
protected $cspService;
/**
* @var Cache
*/
protected $cache;
public function __construct(CspService $cspService, Cache $cache)
{
$this->cspService = $cspService;
$this->cache = $cache;
public function __construct(
protected CspService $cspService,
protected Cache $cache
) {
}
/**
@@ -50,7 +41,8 @@ class CustomHtmlHeadContentProvider
$hash = md5($content);
return $this->cache->remember('custom-head-export:' . $hash, 86400, function () use ($content) {
return HtmlContentFilter::removeActiveContentFromHtmlString($content);
// Custom head content is expected to consist of elements such as <style>,
// <link> and <meta>. The non-content element filter is disabled to keep them,
// so the allow-list (HTMLPurifier) pass must be disabled as well: it targets
// body content and would otherwise strip those same elements.
$config = new HtmlContentFilterConfig(
    filterOutNonContentElements: false,
    useAllowListFilter: false,
);
return (new HtmlContentFilter($config))->filterString($content);
});
}

View File

@@ -5,15 +5,53 @@ namespace BookStack\Util;
use DOMAttr;
use DOMElement;
use DOMNodeList;
use HTMLPurifier;
use HTMLPurifier_HTML5Config;
class HtmlContentFilter
{
/**
* Remove all active content from the given HTML document.
* This aims to cover anything which can dynamically deal with, or send, data
* like any JavaScript actions or form content.
*/
public static function removeActiveContentFromDocument(HtmlDocument $doc): void
/**
 * Create a filter which applies the passes enabled in the given configuration.
 *
 * @param HtmlContentFilterConfig $config Flags read by filterDocument() to decide which filter passes run.
 */
public function __construct(
protected HtmlContentFilterConfig $config
) {
}
/**
 * Filter the given document according to this filter's configuration
 * and return the resulting body HTML.
 *
 * The element/attribute level passes operate on the provided document itself,
 * after which the body HTML is optionally run through the allow-list filter.
 */
public function filterDocument(HtmlDocument $doc): string
{
    $settings = $this->config;

    // Document-level passes, each applied only when enabled.
    if ($settings->filterOutJavaScript) {
        $this->filterOutScriptsFromDocument($doc);
    }

    if ($settings->filterOutFormElements) {
        $this->filterOutFormElementsFromDocument($doc);
    }

    if ($settings->filterOutBadHtmlElements) {
        $this->filterOutBadHtmlElementsFromDocument($doc);
    }

    if ($settings->filterOutNonContentElements) {
        $this->filterOutNonContentElementsFromDocument($doc);
    }

    $bodyHtml = $doc->getBodyInnerHtml();

    // String-level pass, applied last on the serialized body.
    return $settings->useAllowListFilter
        ? $this->applyAllowListFiltering($bodyHtml)
        : $bodyHtml;
}
/**
 * Filter the given HTML string, returning the filtered HTML.
 * The string is wrapped in an HtmlDocument and passed through filterDocument().
 */
public function filterString(string $html): string
{
    $document = new HtmlDocument($html);

    return $this->filterDocument($document);
}
/**
 * Run the given HTML through HTMLPurifier, using its default HTML5 configuration
 * with the serializer cache path pointed at the 'purifier' storage folder.
 */
protected function applyAllowListFiltering(string $html): string
{
    $purifierConfig = HTMLPurifier_HTML5Config::createDefault();
    $purifierConfig->set('Cache.SerializerPath', storage_path('purifier'));

    return (new HTMLPurifier($purifierConfig))->purify($html);
}
protected function filterOutScriptsFromDocument(HtmlDocument $doc): void
{
// Remove standard script tags
$scriptElems = $doc->queryXPath('//script');
@@ -27,10 +65,6 @@ class HtmlContentFilter
$badForms = $doc->queryXPath('//*[' . static::xpathContains('@action', 'javascript:') . '] | //*[' . static::xpathContains('@formaction', 'javascript:') . ']');
static::removeNodes($badForms);
// Remove meta tag to prevent external redirects
$metaTags = $doc->queryXPath('//meta[' . static::xpathContains('@content', 'url') . ']');
static::removeNodes($metaTags);
// Remove data or JavaScript iFrames
$badIframes = $doc->queryXPath('//*[' . static::xpathContains('@src', 'data:') . '] | //*[' . static::xpathContains('@src', 'javascript:') . '] | //*[@srcdoc]');
static::removeNodes($badIframes);
@@ -49,7 +83,10 @@ class HtmlContentFilter
// Remove 'on*' attributes
$onAttributes = $doc->queryXPath('//@*[starts-with(name(), \'on\')]');
static::removeAttributes($onAttributes);
}
protected function filterOutFormElementsFromDocument(HtmlDocument $doc): void
{
// Remove form elements
$formElements = ['form', 'fieldset', 'button', 'textarea', 'select'];
foreach ($formElements as $formElement) {
@@ -75,41 +112,21 @@ class HtmlContentFilter
}
}
/**
* Remove active content from the given HTML string.
* This aims to cover anything which can dynamically deal with, or send, data
* like any JavaScript actions or form content.
*/
public static function removeActiveContentFromHtmlString(string $html): string
protected function filterOutBadHtmlElementsFromDocument(HtmlDocument $doc): void
{
if (empty($html)) {
return $html;
// Remove meta tag to prevent external redirects
$metaTags = $doc->queryXPath('//meta[' . static::xpathContains('@content', 'url') . ']');
static::removeNodes($metaTags);
}
protected function filterOutNonContentElementsFromDocument(HtmlDocument $doc): void
{
// Remove non-content elements
$formElements = ['link', 'style', 'meta', 'title', 'template'];
foreach ($formElements as $formElement) {
$matchingFormElements = $doc->queryXPath('//' . $formElement);
static::removeNodes($matchingFormElements);
}
$doc = new HtmlDocument($html);
static::removeActiveContentFromDocument($doc);
return $doc->getBodyInnerHtml();
}
/**
* Alias using the old method name to avoid potential compatibility breaks during patch release.
* To remove in future feature release.
* @deprecated Use removeActiveContentFromDocument instead.
*/
public static function removeScriptsFromDocument(HtmlDocument $doc): void
{
static::removeActiveContentFromDocument($doc);
}
/**
* Alias using the old method name to avoid potential compatibility breaks during patch release.
* To remove in future feature release.
* @deprecated Use removeActiveContentFromHtmlString instead.
*/
public static function removeScriptsFromHtmlString(string $html): string
{
return static::removeActiveContentFromHtmlString($html);
}
/**
@@ -147,4 +164,34 @@ class HtmlContentFilter
$parentNode->removeAttribute($attrName);
}
}
/**
 * Static alias kept under the old method name to avoid potential
 * compatibility breaks during patch release.
 * Filters the given document in place with the non-content element
 * and allow-list passes turned off.
 * To remove in future feature release.
 * @deprecated Use filterDocument instead.
 */
public static function removeScriptsFromDocument(HtmlDocument $doc): void
{
    $legacyConfig = new HtmlContentFilterConfig(
        filterOutNonContentElements: false,
        useAllowListFilter: false,
    );

    // The returned HTML string is intentionally unused here.
    (new static($legacyConfig))->filterDocument($doc);
}
/**
 * Static alias kept under the old method name to avoid potential
 * compatibility breaks during patch release.
 * Filters the given HTML string with the non-content element
 * and allow-list passes turned off.
 * To remove in future feature release.
 * @deprecated Use filterString instead.
 */
public static function removeScriptsFromHtmlString(string $html): string
{
    $legacyConfig = new HtmlContentFilterConfig(
        filterOutNonContentElements: false,
        useAllowListFilter: false,
    );

    return (new static($legacyConfig))->filterString($html);
}
}

View File

@@ -0,0 +1,31 @@
<?php
namespace BookStack\Util;
/**
 * Immutable set of flags describing which filter passes should be applied.
 * Every pass is enabled unless explicitly switched off.
 */
readonly class HtmlContentFilterConfig
{
    public function __construct(
        public bool $filterOutJavaScript = true,
        public bool $filterOutBadHtmlElements = true,
        public bool $filterOutFormElements = true,
        public bool $filterOutNonContentElements = true,
        public bool $useAllowListFilter = true,
    ) {
    }

    /**
     * Build an instance from a config string in which each character
     * present enables a filter. Matching is case-insensitive.
     *
     *  - j: JavaScript filtering
     *  - h: bad HTML element filtering and non-content element filtering
     *  - f: form element filtering
     *  - a: allow-list filtering
     *
     * Any filter whose character is absent is disabled.
     */
    public static function fromConfigString(string $config): self
    {
        $flags = strtolower($config);
        $enabled = static fn (string $flag): bool => str_contains($flags, $flag);

        return new self(
            filterOutJavaScript: $enabled('j'),
            filterOutBadHtmlElements: $enabled('h'),
            filterOutFormElements: $enabled('f'),
            // There is no dedicated character for this pass; it follows 'h'.
            filterOutNonContentElements: $enabled('h'),
            useAllowListFilter: $enabled('a'),
        );
    }
}

View File

@@ -19,6 +19,7 @@
"ext-zip": "*",
"bacon/bacon-qr-code": "^3.0",
"dompdf/dompdf": "^3.1",
"ezyang/htmlpurifier": "^4.19",
"guzzlehttp/guzzle": "^7.4",
"intervention/image": "^3.5",
"knplabs/knp-snappy": "^1.5",
@@ -38,7 +39,8 @@
"socialiteproviders/microsoft-azure": "^5.1",
"socialiteproviders/okta": "^4.2",
"socialiteproviders/twitch": "^5.3",
"ssddanbrown/htmldiff": "^2.0.0"
"ssddanbrown/htmldiff": "^2.0.0",
"xemlock/htmlpurifier-html5": "^0.1.12"
},
"require-dev": {
"fakerphp/faker": "^1.21",

123
composer.lock generated
View File

@@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically"
],
"content-hash": "556613432c8fb7d8f96bcf637c8c07a9",
"content-hash": "8dc695e5ecb6cea01e282394da136713",
"packages": [
{
"name": "aws/aws-crt-php",
@@ -919,6 +919,67 @@
],
"time": "2025-03-06T22:45:56+00:00"
},
{
"name": "ezyang/htmlpurifier",
"version": "v4.19.0",
"source": {
"type": "git",
"url": "https://github.com/ezyang/htmlpurifier.git",
"reference": "b287d2a16aceffbf6e0295559b39662612b77fcf"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/ezyang/htmlpurifier/zipball/b287d2a16aceffbf6e0295559b39662612b77fcf",
"reference": "b287d2a16aceffbf6e0295559b39662612b77fcf",
"shasum": ""
},
"require": {
"php": "~5.6.0 || ~7.0.0 || ~7.1.0 || ~7.2.0 || ~7.3.0 || ~7.4.0 || ~8.0.0 || ~8.1.0 || ~8.2.0 || ~8.3.0 || ~8.4.0 || ~8.5.0"
},
"require-dev": {
"cerdic/css-tidy": "^1.7 || ^2.0",
"simpletest/simpletest": "dev-master"
},
"suggest": {
"cerdic/css-tidy": "If you want to use the filter 'Filter.ExtractStyleBlocks'.",
"ext-bcmath": "Used for unit conversion and imagecrash protection",
"ext-iconv": "Converts text to and from non-UTF-8 encodings",
"ext-tidy": "Used for pretty-printing HTML"
},
"type": "library",
"autoload": {
"files": [
"library/HTMLPurifier.composer.php"
],
"psr-0": {
"HTMLPurifier": "library/"
},
"exclude-from-classmap": [
"/library/HTMLPurifier/Language/"
]
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"LGPL-2.1-or-later"
],
"authors": [
{
"name": "Edward Z. Yang",
"email": "admin@htmlpurifier.org",
"homepage": "http://ezyang.com"
}
],
"description": "Standards compliant HTML filter written in PHP",
"homepage": "http://htmlpurifier.org/",
"keywords": [
"html"
],
"support": {
"issues": "https://github.com/ezyang/htmlpurifier/issues",
"source": "https://github.com/ezyang/htmlpurifier/tree/v4.19.0"
},
"time": "2025-10-17T16:34:55+00:00"
},
{
"name": "firebase/php-jwt",
"version": "v7.0.2",
@@ -8279,6 +8340,66 @@
}
],
"time": "2024-11-21T01:49:47+00:00"
},
{
"name": "xemlock/htmlpurifier-html5",
"version": "v0.1.12",
"source": {
"type": "git",
"url": "https://github.com/xemlock/htmlpurifier-html5.git",
"reference": "535349cb160bf79752920e1e83c4a94c3e7d2b21"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/xemlock/htmlpurifier-html5/zipball/535349cb160bf79752920e1e83c4a94c3e7d2b21",
"reference": "535349cb160bf79752920e1e83c4a94c3e7d2b21",
"shasum": ""
},
"require": {
"ezyang/htmlpurifier": "^4.8",
"php": ">=5.3"
},
"require-dev": {
"masterminds/html5": "^2.7",
"php-coveralls/php-coveralls": "^1.1|^2.1",
"phpunit/phpunit": ">=4.7 <10.0"
},
"suggest": {
"masterminds/html5": "Required to use HTMLPurifier_Lexer_HTML5"
},
"type": "library",
"autoload": {
"classmap": [
"library/HTMLPurifier/"
]
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "xemlock",
"email": "xemlock@gmail.com"
}
],
"description": "HTML5 support for HTML Purifier",
"homepage": "https://github.com/xemlock/htmlpurifier-html5",
"keywords": [
"HTML5",
"Purifier",
"html",
"htmlpurifier",
"security",
"tidy",
"validator",
"xss"
],
"support": {
"issues": "https://github.com/xemlock/htmlpurifier-html5/issues",
"source": "https://github.com/xemlock/htmlpurifier-html5/tree/v0.1.12"
},
"time": "2026-02-09T21:03:14+00:00"
}
],
"packages-dev": [

2
storage/purifier/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
*
!.gitignore