From 10ebe53bd9c9f086efaa35eecd3656f1a201d84d Mon Sep 17 00:00:00 2001 From: Dan Brown Date: Fri, 13 Feb 2026 14:14:28 +0000 Subject: [PATCH] Page Content: Added more complex & configurable content filtering - Added new option to control parts of the filter. - Added whitelist filtering pass via HTMLPurifier. --- app/Activity/Models/Comment.php | 4 +- app/Config/app.php | 12 ++ app/Entities/Tools/EntityHtmlDescription.php | 4 +- app/Entities/Tools/PageContent.php | 24 +++- app/Theming/CustomHtmlHeadContentProvider.php | 22 +-- app/Util/HtmlContentFilter.php | 133 ++++++++++++------ app/Util/HtmlContentFilterConfig.php | 31 ++++ composer.json | 4 +- composer.lock | 123 +++++++++++++++- storage/purifier/.gitignore | 2 + 10 files changed, 294 insertions(+), 65 deletions(-) create mode 100644 app/Util/HtmlContentFilterConfig.php create mode 100644 storage/purifier/.gitignore diff --git a/app/Activity/Models/Comment.php b/app/Activity/Models/Comment.php index ce05e3df3..ab7d91772 100644 --- a/app/Activity/Models/Comment.php +++ b/app/Activity/Models/Comment.php @@ -8,6 +8,7 @@ use BookStack\Permissions\PermissionApplicator; use BookStack\Users\Models\HasCreatorAndUpdater; use BookStack\Users\Models\OwnableInterface; use BookStack\Util\HtmlContentFilter; +use BookStack\Util\HtmlContentFilterConfig; use Illuminate\Database\Eloquent\Builder; use Illuminate\Database\Eloquent\Factories\HasFactory; use Illuminate\Database\Eloquent\Relations\BelongsTo; @@ -82,7 +83,8 @@ class Comment extends Model implements Loggable, OwnableInterface public function safeHtml(): string { - return HtmlContentFilter::removeActiveContentFromHtmlString($this->html ?? ''); + $filter = new HtmlContentFilter(new HtmlContentFilterConfig()); + return $filter->filterString($this->html ?? ''); } public function jointPermissions(): HasMany diff --git a/app/Config/app.php b/app/Config/app.php index 40e542d3e..acd27e98c 100644 --- a/app/Config/app.php +++ b/app/Config/app.php @@ -42,6 +42,18 @@ return [ // Even when overridden the WYSIWYG editor may still escape script content. 'allow_content_scripts' => env('ALLOW_CONTENT_SCRIPTS', false), + // Control the behaviour of page content filtering. + // This setting is a collection of characters which represent different available filters: + // - j - Filter out JavaScript based content + // - h - Filter out unexpected, potentially dangerous, HTML elements + // - f - Filter out unexpected form elements + // - a - Run content through a more complex allow-list filter + // This defaults to using all filters, unless ALLOW_CONTENT_SCRIPTS is set to true in which case no filters are used. + // Note: These filters are a best attempt, and may not be 100% effective. They are typically a layer used in addition to other security measures. + // TODO - Add to example env + // TODO - Remove allow_content_scripts option above + 'content_filtering' => env('CONTENT_FILTERING', env('ALLOW_CONTENT_SCRIPTS', false) === true ? '' : 'jfha'), + // Allow server-side fetches to be performed to potentially unknown // and user-provided locations. Primarily used in exports when loading // in externally referenced assets. diff --git a/app/Entities/Tools/EntityHtmlDescription.php b/app/Entities/Tools/EntityHtmlDescription.php index b14deb257..6bbfb9b66 100644 --- a/app/Entities/Tools/EntityHtmlDescription.php +++ b/app/Entities/Tools/EntityHtmlDescription.php @@ -6,6 +6,7 @@ use BookStack\Entities\Models\Book; use BookStack\Entities\Models\Bookshelf; use BookStack\Entities\Models\Chapter; use BookStack\Util\HtmlContentFilter; +use BookStack\Util\HtmlContentFilterConfig; class EntityHtmlDescription { @@ -50,7 +51,8 @@ class EntityHtmlDescription return $html; } - return HtmlContentFilter::removeActiveContentFromHtmlString($html); + $filter = new HtmlContentFilter(new HtmlContentFilterConfig()); + return $filter->filterString($html); } public function getPlain(): string diff --git a/app/Entities/Tools/PageContent.php b/app/Entities/Tools/PageContent.php index 5358e8f0c..ca06e6961 100644 --- a/app/Entities/Tools/PageContent.php +++ b/app/Entities/Tools/PageContent.php @@ -13,6 +13,7 @@ use BookStack\Uploads\ImageRepo; use BookStack\Uploads\ImageService; use BookStack\Users\Models\User; use BookStack\Util\HtmlContentFilter; +use BookStack\Util\HtmlContentFilterConfig; use BookStack\Util\HtmlDocument; use BookStack\Util\WebSafeMimeSniffer; use Closure; @@ -317,11 +318,28 @@ class PageContent $this->updateIdsRecursively($doc->getBody(), 0, $idMap, $changeMap); } - if (!config('app.allow_content_scripts')) { - HtmlContentFilter::removeActiveContentFromDocument($doc); + $cacheKey = $this->getContentCacheKey($doc->getBodyInnerHtml()); + $cached = cache()->get($cacheKey, null); + if ($cached !== null) { + return $cached; } - return $doc->getBodyInnerHtml(); + $filterConfig = HtmlContentFilterConfig::fromConfigString(config('app.content_filtering')); + $filter = new HtmlContentFilter($filterConfig); + $filtered = $filter->filterDocument($doc); + + $cacheTime = 86400 * 7; // 1 week + cache()->put($cacheKey, $filtered, $cacheTime); + + return $filtered; + } + + protected function getContentCacheKey(string $html): string + { + $contentHash = md5($html); + $contentId = $this->page->id; + $contentTime = $this->page->updated_at->timestamp; + return "page-content-cache::{$contentId}::{$contentTime}::{$contentHash}"; } /** diff --git a/app/Theming/CustomHtmlHeadContentProvider.php b/app/Theming/CustomHtmlHeadContentProvider.php index e0cf5b3b5..dab30606c 100644 --- a/app/Theming/CustomHtmlHeadContentProvider.php +++ b/app/Theming/CustomHtmlHeadContentProvider.php @@ -4,25 +4,16 @@ namespace BookStack\Theming; use BookStack\Util\CspService; use BookStack\Util\HtmlContentFilter; +use BookStack\Util\HtmlContentFilterConfig; use BookStack\Util\HtmlNonceApplicator; use Illuminate\Contracts\Cache\Repository as Cache; class CustomHtmlHeadContentProvider { - /** - * @var CspService - */ - protected $cspService; - - /** - * @var Cache - */ - protected $cache; - - public function __construct(CspService $cspService, Cache $cache) - { - $this->cspService = $cspService; - $this->cache = $cache; + public function __construct( + protected CspService $cspService, + protected Cache $cache + ) { } /** @@ -50,7 +41,8 @@ class CustomHtmlHeadContentProvider $hash = md5($content); return $this->cache->remember('custom-head-export:' . $hash, 86400, function () use ($content) { - return HtmlContentFilter::removeActiveContentFromHtmlString($content); + $config = new HtmlContentFilterConfig(filterOutNonContentElements: false); + return (new HtmlContentFilter($config))->filterString($content); }); } diff --git a/app/Util/HtmlContentFilter.php b/app/Util/HtmlContentFilter.php index ad5bf8c5f..842e42467 100644 --- a/app/Util/HtmlContentFilter.php +++ b/app/Util/HtmlContentFilter.php @@ -5,15 +5,53 @@ namespace BookStack\Util; use DOMAttr; use DOMElement; use DOMNodeList; +use HTMLPurifier; +use HTMLPurifier_HTML5Config; class HtmlContentFilter { - /** - * Remove all active content from the given HTML document. - * This aims to cover anything which can dynamically deal with, or send, data - * like any JavaScript actions or form content. - */ - public static function removeActiveContentFromDocument(HtmlDocument $doc): void + public function __construct( + protected HtmlContentFilterConfig $config + ) { + } + + public function filterDocument(HtmlDocument $doc): string + { + if ($this->config->filterOutJavaScript) { + $this->filterOutScriptsFromDocument($doc); + } + if ($this->config->filterOutFormElements) { + $this->filterOutFormElementsFromDocument($doc); + } + if ($this->config->filterOutBadHtmlElements) { + $this->filterOutBadHtmlElementsFromDocument($doc); + } + if ($this->config->filterOutNonContentElements) { + $this->filterOutNonContentElementsFromDocument($doc); + } + + $filtered = $doc->getBodyInnerHtml(); + if ($this->config->useAllowListFilter) { + $filtered = $this->applyAllowListFiltering($filtered); + } + + return $filtered; + } + + public function filterString(string $html): string + { + return $this->filterDocument(new HtmlDocument($html)); + } + + protected function applyAllowListFiltering(string $html): string + { + $config = HTMLPurifier_HTML5Config::createDefault(); + $config->set('Cache.SerializerPath', storage_path('purifier')); + $purifier = new HTMLPurifier($config); + return $purifier->purify($html); + } + + protected function filterOutScriptsFromDocument(HtmlDocument $doc): void { // Remove standard script tags $scriptElems = $doc->queryXPath('//script'); @@ -27,10 +65,6 @@ class HtmlContentFilter $badForms = $doc->queryXPath('//*[' . static::xpathContains('@action', 'javascript:') . '] | //*[' . static::xpathContains('@formaction', 'javascript:') . ']'); static::removeNodes($badForms); - // Remove meta tag to prevent external redirects - $metaTags = $doc->queryXPath('//meta[' . static::xpathContains('@content', 'url') . ']'); - static::removeNodes($metaTags); - // Remove data or JavaScript iFrames $badIframes = $doc->queryXPath('//*[' . static::xpathContains('@src', 'data:') . '] | //*[' . static::xpathContains('@src', 'javascript:') . '] | //*[@srcdoc]'); static::removeNodes($badIframes); @@ -49,7 +83,10 @@ class HtmlContentFilter // Remove 'on*' attributes $onAttributes = $doc->queryXPath('//@*[starts-with(name(), \'on\')]'); static::removeAttributes($onAttributes); + } + protected function filterOutFormElementsFromDocument(HtmlDocument $doc): void + { // Remove form elements $formElements = ['form', 'fieldset', 'button', 'textarea', 'select']; foreach ($formElements as $formElement) { @@ -75,41 +112,21 @@ class HtmlContentFilter } } - /** - * Remove active content from the given HTML string. - * This aims to cover anything which can dynamically deal with, or send, data - * like any JavaScript actions or form content. - */ - public static function removeActiveContentFromHtmlString(string $html): string + protected function filterOutBadHtmlElementsFromDocument(HtmlDocument $doc): void { - if (empty($html)) { - return $html; + // Remove meta tag to prevent external redirects + $metaTags = $doc->queryXPath('//meta[' . static::xpathContains('@content', 'url') . ']'); + static::removeNodes($metaTags); + } + + protected function filterOutNonContentElementsFromDocument(HtmlDocument $doc): void + { + // Remove non-content elements + $formElements = ['link', 'style', 'meta', 'title', 'template']; + foreach ($formElements as $formElement) { + $matchingFormElements = $doc->queryXPath('//' . $formElement); + static::removeNodes($matchingFormElements); } - - $doc = new HtmlDocument($html); - static::removeActiveContentFromDocument($doc); - - return $doc->getBodyInnerHtml(); - } - - /** - * Alias using the old method name to avoid potential compatibility breaks during patch release. - * To remove in future feature release. - * @deprecated Use removeActiveContentFromDocument instead. - */ - public static function removeScriptsFromDocument(HtmlDocument $doc): void - { - static::removeActiveContentFromDocument($doc); - } - - /** - * Alias using the old method name to avoid potential compatibility breaks during patch release. - * To remove in future feature release. - * @deprecated Use removeActiveContentFromHtmlString instead. - */ - public static function removeScriptsFromHtmlString(string $html): string - { - return static::removeActiveContentFromHtmlString($html); } /** @@ -147,4 +164,34 @@ class HtmlContentFilter $parentNode->removeAttribute($attrName); } } + + /** + * Alias using the old method name to avoid potential compatibility breaks during patch release. + * To remove in future feature release. + * @deprecated Use filterDocument instead. + */ + public static function removeScriptsFromDocument(HtmlDocument $doc): void + { + $config = new HtmlContentFilterConfig( + filterOutNonContentElements: false, + useAllowListFilter: false, + ); + $filter = new static($config); + $filter->filterDocument($doc); + } + + /** + * Alias using the old method name to avoid potential compatibility breaks during patch release. + * To remove in future feature release. + * @deprecated Use filterString instead. + */ + public static function removeScriptsFromHtmlString(string $html): string + { + $config = new HtmlContentFilterConfig( + filterOutNonContentElements: false, + useAllowListFilter: false, + ); + $filter = new static($config); + return $filter->filterString($html); + } } diff --git a/app/Util/HtmlContentFilterConfig.php b/app/Util/HtmlContentFilterConfig.php new file mode 100644 index 000000000..2cb77ea58 --- /dev/null +++ b/app/Util/HtmlContentFilterConfig.php @@ -0,0 +1,31 @@ +=5.3" + }, + "require-dev": { + "masterminds/html5": "^2.7", + "php-coveralls/php-coveralls": "^1.1|^2.1", + "phpunit/phpunit": ">=4.7 <10.0" + }, + "suggest": { + "masterminds/html5": "Required to use HTMLPurifier_Lexer_HTML5" + }, + "type": "library", + "autoload": { + "classmap": [ + "library/HTMLPurifier/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "xemlock", + "email": "xemlock@gmail.com" + } + ], + "description": "HTML5 support for HTML Purifier", + "homepage": "https://github.com/xemlock/htmlpurifier-html5", + "keywords": [ + "HTML5", + "Purifier", + "html", + "htmlpurifier", + "security", + "tidy", + "validator", + "xss" + ], + "support": { + "issues": "https://github.com/xemlock/htmlpurifier-html5/issues", + "source": "https://github.com/xemlock/htmlpurifier-html5/tree/v0.1.12" + }, + "time": "2026-02-09T21:03:14+00:00" } ], "packages-dev": [ diff --git a/storage/purifier/.gitignore b/storage/purifier/.gitignore new file mode 100644 index 000000000..c96a04f00 --- /dev/null +++ b/storage/purifier/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore \ No newline at end of file