Page Content: Added more complex & configurable content filtering

- Added new option to control parts of the filter.
- Added whitelist filtering pass via HTMLPurifier.
This commit is contained in:
Dan Brown
2026-02-13 14:14:28 +00:00
parent 46dcc30bf7
commit 10ebe53bd9
10 changed files with 294 additions and 65 deletions

View File

@@ -8,6 +8,7 @@ use BookStack\Permissions\PermissionApplicator;
use BookStack\Users\Models\HasCreatorAndUpdater;
use BookStack\Users\Models\OwnableInterface;
use BookStack\Util\HtmlContentFilter;
use BookStack\Util\HtmlContentFilterConfig;
use Illuminate\Database\Eloquent\Builder;
use Illuminate\Database\Eloquent\Factories\HasFactory;
use Illuminate\Database\Eloquent\Relations\BelongsTo;
@@ -82,7 +83,8 @@ class Comment extends Model implements Loggable, OwnableInterface
public function safeHtml(): string
{
return HtmlContentFilter::removeActiveContentFromHtmlString($this->html ?? '');
$filter = new HtmlContentFilter(new HtmlContentFilterConfig());
return $filter->filterString($this->html ?? '');
}
public function jointPermissions(): HasMany

View File

@@ -42,6 +42,18 @@ return [
// Even when overridden the WYSIWYG editor may still escape script content.
'allow_content_scripts' => env('ALLOW_CONTENT_SCRIPTS', false),
// Control the behaviour of page content filtering.
// This setting is a collection of characters which represent different available filters:
// - j - Filter out JavaScript based content
// - h - Filter out unexpected, potentially dangerous, HTML elements
// - f - Filter out unexpected form elements
// - a - Run content through a more complex allow-list filter
// This defaults to using all filters, unless ALLOW_CONTENT_SCRIPTS is set to true in which case no filters are used.
// Note: These filters are a best attempt, and may not be 100% effective. They are typically a layer used in addition to other security measures.
// TODO - Add to example env
// TODO - Remove allow_content_scripts option above
'content_filtering' => env('CONTENT_FILTERING', env('ALLOW_CONTENT_SCRIPTS', false) === true ? '' : 'jfha'),
// Allow server-side fetches to be performed to potentially unknown
// and user-provided locations. Primarily used in exports when loading
// in externally referenced assets.

View File

@@ -6,6 +6,7 @@ use BookStack\Entities\Models\Book;
use BookStack\Entities\Models\Bookshelf;
use BookStack\Entities\Models\Chapter;
use BookStack\Util\HtmlContentFilter;
use BookStack\Util\HtmlContentFilterConfig;
class EntityHtmlDescription
{
@@ -50,7 +51,8 @@ class EntityHtmlDescription
return $html;
}
return HtmlContentFilter::removeActiveContentFromHtmlString($html);
$filter = new HtmlContentFilter(new HtmlContentFilterConfig());
return $filter->filterString($html);
}
public function getPlain(): string

View File

@@ -13,6 +13,7 @@ use BookStack\Uploads\ImageRepo;
use BookStack\Uploads\ImageService;
use BookStack\Users\Models\User;
use BookStack\Util\HtmlContentFilter;
use BookStack\Util\HtmlContentFilterConfig;
use BookStack\Util\HtmlDocument;
use BookStack\Util\WebSafeMimeSniffer;
use Closure;
@@ -317,11 +318,28 @@ class PageContent
$this->updateIdsRecursively($doc->getBody(), 0, $idMap, $changeMap);
}
if (!config('app.allow_content_scripts')) {
HtmlContentFilter::removeActiveContentFromDocument($doc);
$cacheKey = $this->getContentCacheKey($doc->getBodyInnerHtml());
$cached = cache()->get($cacheKey, null);
if ($cached !== null) {
return $cached;
}
return $doc->getBodyInnerHtml();
$filterConfig = HtmlContentFilterConfig::fromConfigString(config('app.content_filtering'));
$filter = new HtmlContentFilter($filterConfig);
$filtered = $filter->filterDocument($doc);
$cacheTime = 86400 * 7; // 1 week
cache()->put($cacheKey, $filtered, $cacheTime);
return $filtered;
}
protected function getContentCacheKey(string $html): string
{
$contentHash = md5($html);
$contentId = $this->page->id;
$contentTime = $this->page->updated_at->timestamp;
return "page-content-cache::{$contentId}::{$contentTime}::{$contentHash}";
}
/**

View File

@@ -4,25 +4,16 @@ namespace BookStack\Theming;
use BookStack\Util\CspService;
use BookStack\Util\HtmlContentFilter;
use BookStack\Util\HtmlContentFilterConfig;
use BookStack\Util\HtmlNonceApplicator;
use Illuminate\Contracts\Cache\Repository as Cache;
class CustomHtmlHeadContentProvider
{
/**
* @var CspService
*/
protected $cspService;
/**
* @var Cache
*/
protected $cache;
public function __construct(CspService $cspService, Cache $cache)
{
$this->cspService = $cspService;
$this->cache = $cache;
public function __construct(
protected CspService $cspService,
protected Cache $cache
) {
}
/**
@@ -50,7 +41,8 @@ class CustomHtmlHeadContentProvider
$hash = md5($content);
return $this->cache->remember('custom-head-export:' . $hash, 86400, function () use ($content) {
return HtmlContentFilter::removeActiveContentFromHtmlString($content);
$config = new HtmlContentFilterConfig(filterOutNonContentElements: false);
return (new HtmlContentFilter($config))->filterString($content);
});
}

View File

@@ -5,15 +5,53 @@ namespace BookStack\Util;
use DOMAttr;
use DOMElement;
use DOMNodeList;
use HTMLPurifier;
use HTMLPurifier_HTML5Config;
class HtmlContentFilter
{
/**
* Remove all active content from the given HTML document.
* This aims to cover anything which can dynamically deal with, or send, data
* like any JavaScript actions or form content.
*/
public static function removeActiveContentFromDocument(HtmlDocument $doc): void
public function __construct(
protected HtmlContentFilterConfig $config
) {
}
public function filterDocument(HtmlDocument $doc): string
{
if ($this->config->filterOutJavaScript) {
$this->filterOutScriptsFromDocument($doc);
}
if ($this->config->filterOutFormElements) {
$this->filterOutFormElementsFromDocument($doc);
}
if ($this->config->filterOutBadHtmlElements) {
$this->filterOutBadHtmlElementsFromDocument($doc);
}
if ($this->config->filterOutNonContentElements) {
$this->filterOutNonContentElementsFromDocument($doc);
}
$filtered = $doc->getBodyInnerHtml();
if ($this->config->useAllowListFilter) {
$filtered = $this->applyAllowListFiltering($filtered);
}
return $filtered;
}
public function filterString(string $html): string
{
return $this->filterDocument(new HtmlDocument($html));
}
protected function applyAllowListFiltering(string $html): string
{
$config = HTMLPurifier_HTML5Config::createDefault();
$config->set('Cache.SerializerPath', storage_path('purifier'));
$purifier = new HTMLPurifier($config);
return $purifier->purify($html);
}
protected function filterOutScriptsFromDocument(HtmlDocument $doc): void
{
// Remove standard script tags
$scriptElems = $doc->queryXPath('//script');
@@ -27,10 +65,6 @@ class HtmlContentFilter
$badForms = $doc->queryXPath('//*[' . static::xpathContains('@action', 'javascript:') . '] | //*[' . static::xpathContains('@formaction', 'javascript:') . ']');
static::removeNodes($badForms);
// Remove meta tag to prevent external redirects
$metaTags = $doc->queryXPath('//meta[' . static::xpathContains('@content', 'url') . ']');
static::removeNodes($metaTags);
// Remove data or JavaScript iFrames
$badIframes = $doc->queryXPath('//*[' . static::xpathContains('@src', 'data:') . '] | //*[' . static::xpathContains('@src', 'javascript:') . '] | //*[@srcdoc]');
static::removeNodes($badIframes);
@@ -49,7 +83,10 @@ class HtmlContentFilter
// Remove 'on*' attributes
$onAttributes = $doc->queryXPath('//@*[starts-with(name(), \'on\')]');
static::removeAttributes($onAttributes);
}
protected function filterOutFormElementsFromDocument(HtmlDocument $doc): void
{
// Remove form elements
$formElements = ['form', 'fieldset', 'button', 'textarea', 'select'];
foreach ($formElements as $formElement) {
@@ -75,41 +112,21 @@ class HtmlContentFilter
}
}
/**
* Remove active content from the given HTML string.
* This aims to cover anything which can dynamically deal with, or send, data
* like any JavaScript actions or form content.
*/
public static function removeActiveContentFromHtmlString(string $html): string
protected function filterOutBadHtmlElementsFromDocument(HtmlDocument $doc): void
{
if (empty($html)) {
return $html;
// Remove meta tag to prevent external redirects
$metaTags = $doc->queryXPath('//meta[' . static::xpathContains('@content', 'url') . ']');
static::removeNodes($metaTags);
}
protected function filterOutNonContentElementsFromDocument(HtmlDocument $doc): void
{
// Remove non-content elements
$formElements = ['link', 'style', 'meta', 'title', 'template'];
foreach ($formElements as $formElement) {
$matchingFormElements = $doc->queryXPath('//' . $formElement);
static::removeNodes($matchingFormElements);
}
$doc = new HtmlDocument($html);
static::removeActiveContentFromDocument($doc);
return $doc->getBodyInnerHtml();
}
/**
* Alias using the old method name to avoid potential compatibility breaks during patch release.
* To remove in future feature release.
* @deprecated Use removeActiveContentFromDocument instead.
*/
public static function removeScriptsFromDocument(HtmlDocument $doc): void
{
static::removeActiveContentFromDocument($doc);
}
/**
* Alias using the old method name to avoid potential compatibility breaks during patch release.
* To remove in future feature release.
* @deprecated Use removeActiveContentFromHtmlString instead.
*/
public static function removeScriptsFromHtmlString(string $html): string
{
return static::removeActiveContentFromHtmlString($html);
}
/**
@@ -147,4 +164,34 @@ class HtmlContentFilter
$parentNode->removeAttribute($attrName);
}
}
/**
* Alias using the old method name to avoid potential compatibility breaks during patch release.
* To remove in future feature release.
* @deprecated Use filterDocument instead.
*/
public static function removeScriptsFromDocument(HtmlDocument $doc): void
{
$config = new HtmlContentFilterConfig(
filterOutNonContentElements: false,
useAllowListFilter: false,
);
$filter = new static($config);
$filter->filterDocument($doc);
}
/**
* Alias using the old method name to avoid potential compatibility breaks during patch release.
* To remove in future feature release.
* @deprecated Use filterString instead.
*/
public static function removeScriptsFromHtmlString(string $html): string
{
$config = new HtmlContentFilterConfig(
filterOutNonContentElements: false,
useAllowListFilter: false,
);
$filter = new static($config);
return $filter->filterString($html);
}
}

View File

@@ -0,0 +1,31 @@
<?php
namespace BookStack\Util;
readonly class HtmlContentFilterConfig
{
public function __construct(
public bool $filterOutJavaScript = true,
public bool $filterOutBadHtmlElements = true,
public bool $filterOutFormElements = true,
public bool $filterOutNonContentElements = true,
public bool $useAllowListFilter = true,
) {
}
/**
* Create an instance from a config string, where the string
* is a combination of characters to enable filters.
*/
public static function fromConfigString(string $config): self
{
$config = strtolower($config);
return new self(
filterOutJavaScript: str_contains($config, 'j'),
filterOutBadHtmlElements: str_contains($config, 'h'),
filterOutFormElements: str_contains($config, 'f'),
filterOutNonContentElements: str_contains($config, 'h'),
useAllowListFilter: str_contains($config, 'a'),
);
}
}