Content: Tuned HTML purifier for our use

Tested it with a range of supported content types, including uncommon
ones, and added support or changed config where needed.
Went through the docs for all HTMLPurifier options to assess which
are relevant.
This commit is contained in:
Dan Brown
2026-02-15 16:17:03 +00:00
parent 10ebe53bd9
commit 0f040fe8b1
3 changed files with 104 additions and 6 deletions

View File

@@ -321,12 +321,13 @@ class PageContent
$cacheKey = $this->getContentCacheKey($doc->getBodyInnerHtml());
$cached = cache()->get($cacheKey, null);
if ($cached !== null) {
return $cached;
// return $cached;
}
$filterConfig = HtmlContentFilterConfig::fromConfigString(config('app.content_filtering'));
$filter = new HtmlContentFilter($filterConfig);
$filtered = $filter->filterDocument($doc);
// $filtered = $doc->getBodyInnerHtml();
$cacheTime = 86400 * 7; // 1 week
cache()->put($cacheKey, $filtered, $cacheTime);

View File

@@ -0,0 +1,101 @@
<?php
namespace BookStack\Util;
use HTMLPurifier;
use HTMLPurifier_Config;
use HTMLPurifier_HTML5Config;
use HTMLPurifier_HTMLDefinition;
/**
 * HTMLPurifier wrapper which applies the purifier options and the extra
 * element definitions (object, embed, checkbox input) configured below,
 * exposing a single purify() entry point.
 */
class ConfiguredHtmlPurifier
{
    /**
     * Identifier for the customised HTML definition built by this class.
     * HTML Purifier requires a definition ID when a raw definition is
     * customised and cached.
     */
    protected const DEFINITION_ID = 'bookstack-content-filter';

    /**
     * Revision of the customised HTML definition.
     * Increase this whenever configureDefinition() changes, otherwise a
     * previously cached definition would continue to be used.
     */
    protected const DEFINITION_REVISION = 1;

    protected HTMLPurifier $purifier;

    /**
     * Build the purifier: apply config options, customise the HTML
     * definition where one is handed back, then create the purifier.
     */
    public function __construct()
    {
        $config = HTMLPurifier_HTML5Config::createDefault();
        $this->setConfig($config);

        // The raw definition is only customised when an actual definition
        // instance is returned; any other result (such as null when an
        // already-built definition is in use) skips the customisation.
        $htmlDef = $config->getDefinition('HTML', true, true);
        if ($htmlDef instanceof HTMLPurifier_HTMLDefinition) {
            $this->configureDefinition($htmlDef);
        }

        $this->purifier = new HTMLPurifier($config);
    }

    /**
     * Apply the purifier options used for content filtering.
     * Must run before the HTML definition is fetched from the config.
     */
    protected function setConfig(HTMLPurifier_Config $config): void
    {
        // Location for serialized definition cache files.
        $config->set('Cache.SerializerPath', storage_path('purifier'));

        // Identify and version our customised definition so that it can be
        // cached, and so the cache can be invalidated via the revision.
        // NOTE(review): HTMLPurifier_HTML5Config may supply its own definition
        // ID/revision; the values set here replace them - confirm acceptable.
        $config->set('HTML.DefinitionID', static::DEFINITION_ID);
        $config->set('HTML.DefinitionRev', static::DEFINITION_REVISION);

        $config->set('CSS.AllowTricky', true);
        $config->set('HTML.SafeIframe', true);
        $config->set('Attr.EnableID', true);
        $config->set('Attr.ID.HTML5', true);
        $config->set('Output.FixInnerHTML', false);

        // Iframe sources are accepted when they start with http:// or https://
        $config->set('URI.SafeIframeRegexp', '%^(http://|https://)%');

        $config->set('URI.AllowedSchemes', [
            'http' => true,
            'https' => true,
            'mailto' => true,
            'ftp' => true,
            'nntp' => true,
            'news' => true,
            'tel' => true,
            'file' => true,
        ]);
    }

    /**
     * Register the additional elements we allow on top of the base definition.
     * Each addElement() call provides: element name, content set the element
     * belongs to, its content model, its attribute collection and then the
     * element-specific attributes.
     */
    public function configureDefinition(HTMLPurifier_HTMLDefinition $definition): void
    {
        // Allow the object element
        $definition->addElement(
            'object',
            'Inline',
            'Flow',
            'Common',
            [
                'data' => 'URI',
                'type' => 'Text',
                'width' => 'Length',
                'height' => 'Length',
            ]
        );

        // Allow the embed element
        $definition->addElement(
            'embed',
            'Inline',
            'Empty',
            'Common',
            [
                'src' => 'URI',
                'type' => 'Text',
                'width' => 'Length',
                'height' => 'Length',
            ]
        );

        // Allow checkbox inputs (the type attribute is limited to "checkbox")
        $definition->addElement(
            'input',
            'Formctrl',
            'Empty',
            'Common',
            [
                'checked' => 'Bool#checked',
                'disabled' => 'Bool#disabled',
                'name' => 'Text',
                'readonly' => 'Bool#readonly',
                'type' => 'Enum#checkbox',
                'value' => 'Text',
            ]
        );
    }

    /**
     * Purify the given HTML string using the configured purifier,
     * returning the purified HTML.
     */
    public function purify(string $html): string
    {
        return $this->purifier->purify($html);
    }
}

View File

@@ -5,8 +5,6 @@ namespace BookStack\Util;
use DOMAttr;
use DOMElement;
use DOMNodeList;
use HTMLPurifier;
use HTMLPurifier_HTML5Config;
class HtmlContentFilter
{
@@ -45,9 +43,7 @@ class HtmlContentFilter
/**
 * Filter the given HTML by passing it through ConfiguredHtmlPurifier,
 * returning the purified HTML string.
 */
protected function applyAllowListFiltering(string $html): string
{
    // Purifier options and element definitions are set up within
    // ConfiguredHtmlPurifier, so no purifier config is built here.
    $purifier = new ConfiguredHtmlPurifier();

    return $purifier->purify($html);
}