Revamped the Html Truncator code to address issues with invalid HTML #1019

2025-02-20 19:56:53 +01:00 · 2016-09-06 15:38:36 -06:00 · 2016-09-06 15:38:36 -06:00 · a54f30b8ae
commit a54f30b8ae
parent 10825d3f70
6 changed files with 229 additions and 164 deletions
--- a/composer.json
+++ b/composer.json
@ -28,7 +28,8 @@
        "ext-openssl": "*",
        "ext-curl": "*",
        "ext-zip": "*",
-        "league/climate": "^3.2"
+        "league/climate": "^3.2",
+        "antoligy/dom-string-iterators": "^1.0"
    },
    "require-dev": {
        "codeception/codeception": "^2.1",
--- a/composer.lock
+++ b/composer.lock
@ -4,9 +4,53 @@
        "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file",
        "This file is @generated automatically"
    ],
-    "hash": "25e59d23a9af7f43dd9cd9d462057abd",
-    "content-hash": "22973a67f2eae64610e739fa82a3d60b",
+    "hash": "7a8caecbaedbf785d96b7437f296ca66",
+    "content-hash": "2fec25b3b5d627c0896d5ee3030b6bed",
    "packages": [
+        {
+            "name": "antoligy/dom-string-iterators",
+            "version": "v1.0.0",
+            "source": {
+                "type": "git",
+                "url": "https://github.com/antoligy/dom-string-iterators.git",
+                "reference": "9a624b082493fee9b972840dbd677494edb94cf7"
+            },
+            "dist": {
+                "type": "zip",
+                "url": "https://api.github.com/repos/antoligy/dom-string-iterators/zipball/9a624b082493fee9b972840dbd677494edb94cf7",
+                "reference": "9a624b082493fee9b972840dbd677494edb94cf7",
+                "shasum": ""
+            },
+            "require": {
+                "php": ">=5.3.0"
+            },
+            "type": "library",
+            "autoload": {
+                "psr-4": {
+                    "": "src/"
+                }
+            },
+            "notification-url": "https://packagist.org/downloads/",
+            "license": [
+                "Public Domain"
+            ],
+            "authors": [
+                {
+                    "name": "Alex Wilson",
+                    "email": "a@ax.gy"
+                },
+                {
+                    "name": "Kornel Lesinski",
+                    "email": "pornel@pornel.net"
+                },
+                {
+                    "name": "Patrick Galbraith",
+                    "email": "patrick.j.galbraith@gmail.com"
+                }
+            ],
+            "description": "Composer package for DOMWordsIterator and DOMLettersIterator",
+            "time": "2015-11-04 17:33:14"
+        },
        {
            "name": "doctrine/cache",
            "version": "v1.6.0",
--- a/system/src/Grav/Common/Helpers/Truncator.php
+++ b/system/src/Grav/Common/Helpers/Truncator.php
@ -8,12 +8,16 @@

 namespace Grav\Common\Helpers;

+use DOMText;
 use DOMDocument;
+use DOMWordsIterator;
+use DOMLettersIterator;

 /**
- * This file is part of urodoz/truncateHTML.
+ * This file is part of https://github.com/Bluetel-Solutions/twig-truncate-extension
 *
- * (c) Albert Lacarta <urodoz@gmail.com>
+ * Copyright (c) 2015 Bluetel Solutions developers@bluetel.co.uk
+ * Copyright (c) 2015 Alex Wilson ajw@bluetel.co.uk
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
@ -21,181 +25,188 @@ use DOMDocument;

 class Truncator {

-    public static $default_options = array(
-        'ellipsis' => '…',
-        'break' => ' ',
-        'length_in_chars' => false,
-        'word_safe' => false,
-    );
+    /**
+     * Safely truncates HTML by a given number of words.
+     * @param  string  $html     Input HTML.
+     * @param  integer $limit    Limit to how many words we preserve.
+     * @param  string  $ellipsis String to use as ellipsis (if any).
+     * @return string            Safe truncated HTML.
+     */
+    public static function truncateWords($html, $limit = 0, $ellipsis = "")
+    {
+        // A non-positive limit means "do not truncate".
+        if ($limit <= 0) {
+            return $html;
+        }

-    // These tags are allowed to have an ellipsis inside
-    public static $ellipsable_tags = array(
-        'p', 'ol', 'ul', 'li',
-        'div', 'header', 'article', 'nav',
-        'section', 'footer', 'aside',
-        'dd', 'dt', 'dl',
-    );
+        $dom = self::htmlToDomDocument($html);

-    public static $self_closing_tags = array(
-        'br', 'hr', 'img',
-    );
+        // Grab the body of our DOM.
+        $body = $dom->getElementsByTagName("body")->item(0);
+
+        // Empty or unparseable input produces no <body>; there is nothing to
+        // walk, so hand the input back untouched instead of passing null to
+        // the iterator.
+        if ($body === null) {
+            return $html;
+        }
+
+        // Iterate over words.
+        $iterator = new DOMWordsIterator($body);
+        foreach ($iterator as $word) {
+
+            // If we have exceeded the limit, we delete the remainder of the content.
+            if ($iterator->key() >= $limit) {
+
+                // Grab current position. As used below, the tuple holds the
+                // text node, the index of the current word inside that node,
+                // and the node's list of [word, offset] pairs.
+                $position = $iterator->currentWordPosition();
+                $curNode = $position[0];
+                $wordIndex = $position[1];
+                $nodeWords = $position[2];
+
+                // Keep the node's text up to and including the current word.
+                $curNode->nodeValue = substr(
+                    $curNode->nodeValue,
+                    0,
+                    $nodeWords[$wordIndex][1] + strlen($nodeWords[$wordIndex][0])
+                );
+
+                self::removeProceedingNodes($curNode, $body);
+
+                if (!empty($ellipsis)) {
+                    self::insertEllipsis($curNode, $ellipsis);
+                }
+
+                break;
+            }
+
+        }
+
+        return self::innerHTML($body);
+    }

    /**
-     * Truncate given HTML string to specified length.
-     * If length_in_chars is false it's trimmed by number
-     * of words, otherwise by number of characters.
-     *
-     * @param  string        $html
-     * @param  integer       $length
-     * @param  string|array  $opts
-     * @return string
+     * Safely truncates HTML by a given number of letters.
+     * @param  string  $html     Input HTML.
+     * @param  integer $limit    Limit to how many letters we preserve.
+     * @param  string  $ellipsis String to use as ellipsis (if any).
+     * @return string            Safe truncated HTML.
     */
-    public static function truncate($html, $length, $opts=array())
+    public static function truncateLetters($html, $limit = 0, $ellipsis = "")
    {
-        if (is_string($opts)) $opts = array('ellipsis' => $opts);
-        $opts = array_merge(static::$default_options, $opts);
-        // wrap the html in case it consists of adjacent nodes like <p>foo</p><p>bar</p>
-        $html = mb_convert_encoding("<div>".$html."</div>", 'HTML-ENTITIES', 'UTF-8');
-
-        $root_node = null;
-        // Parse using HTML5Lib if it's available.
-        if (class_exists('HTML5Lib\\Parser')) {
-            try {
-                $doc = \HTML5Lib\Parser::parse($html);
-                $root_node = $doc->documentElement->lastChild->lastChild;
-            }
-            catch (\Exception $e) {
-                ;
-            }
+        if ($limit <= 0) {
+            return $html;
        }
-        if ($root_node === null) {
-            // HTML5Lib not available so we'll have to use DOMDocument
-            // We'll only be able to parse HTML5 if it's valid XML
-            $doc = new DOMDocument('4.01', 'utf-8');
-            $doc->formatOutput = false;
-            $doc->preserveWhiteSpace = true;
-            // loadHTML will fail with HTML5 tags (article, nav, etc)
-            // so we need to suppress errors and if it fails to parse we
-            // retry with the XML parser instead
-            $prev_use_errors = libxml_use_internal_errors(true);
-            if ($doc->loadHTML($html)) {
-                $root_node = $doc->documentElement->lastChild->lastChild;
-            }
-            else if ($doc->loadXML($html)) {
-                $root_node = $doc->documentElement;
-            }
-            else {
-                libxml_use_internal_errors($prev_use_errors);
-                throw new \RuntimeException;
-            }
-            libxml_use_internal_errors($prev_use_errors);
-        }
-        list($text, $_, $opts) = static::truncateNode($doc, $root_node, $length, $opts);

-        $text = mb_substr(mb_substr($text, 0, -6), 5);
+        $dom = self::htmlToDomDocument($html);

-        return $text;
-    }
+        // Grab the body of our DOM.
+        $body = $dom->getElementsByTagName("body")->item(0);
+
+        // Empty or unparseable input produces no <body>; return the input
+        // untouched instead of passing null to the iterator.
+        if ($body === null) {
+            return $html;
+        }

-    protected static function truncateNode($doc, $node, $length, $opts)
-    {
-        if ($length === 0 && !static::ellipsable($node)) {
-            return array('', 1, $opts);
-        }
-        list($inner, $remaining, $opts) = static::innerTruncate($doc, $node, $length, $opts);
-        if (0 === mb_strlen($inner)) {
-            return array(in_array(mb_strtolower($node->nodeName), static::$self_closing_tags) ? $doc->saveXML($node) : "", $length - $remaining, $opts);
-        }
-        while($node->firstChild) {
-            $node->removeChild($node->firstChild);
-        }
-        $newNode = $doc->createDocumentFragment();
-        // handle the ampersand
-        $newNode->appendXml(static::xmlEscape($inner));
-        $node->appendChild($newNode);
-        return array($doc->saveXML($node), $length - $remaining, $opts);
-    }
+        // Iterate over letters.
+        $letters = new DOMLettersIterator($body);
+        foreach ($letters as $letter) {

-    protected static function innerTruncate($doc, $node, $length, $opts)
-    {
-        $inner = '';
-        $remaining = $length;
-        foreach($node->childNodes as $childNode) {
-            if ($childNode->nodeType === XML_ELEMENT_NODE) {
-                list($txt, $nb, $opts) = static::truncateNode($doc, $childNode, $remaining, $opts);
-            }
-            else if ($childNode->nodeType === XML_TEXT_NODE) {
-                list($txt, $nb, $opts) = static::truncateText($childNode, $remaining, $opts);
-            } else {
-                $txt = '';
-                $nb  = 0;
-            }
+            // If we have exceeded the limit, we want to delete the remainder of this document.
+            if ($letters->key() >= $limit) {

-            // unhandle the ampersand
-            $txt = static::xmlUnescape($txt);
+                // As used below, the position is [text node, index of the
+                // current letter inside that node].
+                // NOTE(review): the iterator's index appears to count
+                // characters, not bytes, so the cut uses mb_substr(); a
+                // byte-based substr() truncates multibyte text too early and
+                // can split a character in half.
+                $currentText = $letters->currentTextPosition();
+                $currentText[0]->nodeValue = mb_substr($currentText[0]->nodeValue, 0, $currentText[1] + 1, 'UTF-8');
+                self::removeProceedingNodes($currentText[0], $body);

-            $remaining -= $nb;
-            $inner .= $txt;
-            if ($remaining < 0) {
-                if (static::ellipsable($node)) {
-                    $inner = preg_replace('/(?:[\s\pP]+|(?:&(?:[a-z]+|#[0-9]+);?))*$/u', '', $inner).$opts['ellipsis'];
-                    $opts['ellipsis'] = '';
-                    $opts['was_truncated'] = true;
+                if (!empty($ellipsis)) {
+                    self::insertEllipsis($currentText[0], $ellipsis);
                }
+
                break;
            }
        }
-        return array($inner, $remaining, $opts);
+
+        return self::innerHTML($body);
    }

-    protected static function truncateText($node, $length, $opts)
+    /**
+     * Builds a DOMDocument object from a string containing HTML.
+     *
+     * libxml's internal error collection is switched on only while parsing
+     * (HTML5 tags make the parser complain) and the caller's previous
+     * setting is restored afterwards; collected errors are discarded.
+     *
+     * @param  string $html HTML to load.
+     * @return DOMDocument  Document holding the parsed markup; it has no
+     *                      child nodes when $html is empty.
+     */
+    public static function htmlToDomDocument($html)
    {
-        $string = $node->textContent;
+        // Transform multibyte entities which otherwise display incorrectly.
+        $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');

-        if ($opts['length_in_chars']) {
-            $count = mb_strlen($string);
-            if ($count <= $length && $length > 0) {
-                return array($string, $count, $opts);
-            }
-            if ($opts['word_safe']) {
-                if (false !== ($breakpoint = mb_strpos($string, $opts['break'], $length))) {
-                    if ($breakpoint < mb_strlen($string) - 1) {
-                        $string = mb_substr($string, 0, $breakpoint) . $opts['break'];
-                    }
+        // Internal errors enabled as HTML5 not fully supported. Remember the
+        // previous setting so this helper does not leak a global change.
+        $previousSetting = libxml_use_internal_errors(true);
+
+        // Instantiate new DOMDocument object, and then load in UTF-8 HTML.
+        $dom = new DOMDocument();
+        $dom->encoding = 'UTF-8';
+
+        // loadHTML() rejects an empty string, so skip parsing in that case.
+        if ((string) $html !== '') {
+            $dom->loadHTML($html);
+        }
+
+        // Drop the buffered parse errors and restore the caller's setting.
+        libxml_clear_errors();
+        libxml_use_internal_errors($previousSetting);
+
+        return $dom;
+    }
+
+    /**
+     * Removes all nodes after the current node ("proceeding" here means
+     * "following").
+     *
+     * If $domNode has a next sibling, everything following that sibling is
+     * cleared recursively and then the sibling itself is removed. Otherwise
+     * the method climbs through the ancestors of $domNode, stopping at
+     * $topNode, and applies the same removal to the next sibling of the
+     * first ancestor that has one.
+     *
+     * NOTE(review): the upward scan only ends when it meets $topNode, so
+     * $domNode is presumably expected to be a descendant of $topNode.
+     *
+     * @param  DOMNode|DOMElement $domNode Node after which content is removed.
+     * @param  DOMNode|DOMElement $topNode Ancestor at which the upward scan stops.
+     * @return void
+     */
+    private static function removeProceedingNodes($domNode, $topNode)
+    {
+        $nextNode = $domNode->nextSibling;
+
+        if ($nextNode !== null) {
+            self::removeProceedingNodes($nextNode, $topNode);
+            $domNode->parentNode->removeChild($nextNode);
+        } else {
+            // No following sibling here: scan upwards until an ancestor with a next sibling is found.
+            $curNode = $domNode->parentNode;
+            while ($curNode !== $topNode) {
+                if ($curNode->nextSibling !== null) {
+                    $curNode = $curNode->nextSibling;
+                    self::removeProceedingNodes($curNode, $topNode);
+                    $curNode->parentNode->removeChild($curNode);
+                    break;
                }
-                return array($string, $count, $opts);
+                $curNode = $curNode->parentNode;
            }
-            return array(mb_substr($node->textContent, 0, $length), $count, $opts);
-        }
-        else {
-            preg_match_all('/\s*\S+/', $string, $words);
-            $words = $words[0];
-            $count = count($words);
-            if ($count <= $length && $length > 0) {
-                return array($string, $count, $opts);
-            }
-            return array(implode('', array_slice($words, 0, $length)), $count, $opts);
        }
    }

-    protected static function ellipsable($node)
+    /**
+     * Inserts an ellipsis after the given text node.
+     *
+     * When the text node sits directly inside one of the "avoid" tags
+     * (links, emphasis, headings), the ellipsis is added as a new text node
+     * immediately after that element, inside the element's parent.
+     * Otherwise it is appended to the text node itself, after trailing
+     * whitespace has been trimmed.
+     *
+     * @param  DOMNode|DOMElement $domNode  Element to insert after.
+     * @param  string             $ellipsis Text used to suffix our document.
+     * @return void
+     */
+    private static function insertEllipsis($domNode, $ellipsis)
    {
-        return ($node instanceof DOMDocument)
-        || in_array(mb_strtolower($node->nodeName), static::$ellipsable_tags)
-            ;
+        $avoid = array('a', 'strong', 'em', 'h1', 'h2', 'h3', 'h4', 'h5'); //html tags to avoid appending the ellipsis to
+
+        $wrapper = $domNode->parentNode;
+        $container = $wrapper->parentNode;
+
+        if ($container !== null && in_array($wrapper->nodeName, $avoid, true)) {
+            // Append as text node to parent instead
+            $textNode = new DOMText($ellipsis);
+
+            // The reference node handed to insertBefore() must be a child of
+            // the node it is called on, so anchor on the wrapper's own next
+            // sibling; a null reference makes insertBefore() append.
+            $container->insertBefore($textNode, $wrapper->nextSibling);
+
+        } else {
+            // Append to current node
+            $domNode->nodeValue = rtrim($domNode->nodeValue) . $ellipsis;
+        }
    }

-    protected static function xmlEscape($string)
-    {
-        $string = str_replace('&', '&amp;', $string);
-        $string = str_replace('<?', '&lt;?', $string);
-        return $string;
+    /**
+     * Serialises the children of a DOM element to an HTML string.
+     *
+     * Each child is deep-copied into its own scratch document, rendered with
+     * saveHTML(), trimmed, and the pieces are concatenated in order.
+     *
+     * @param $element
+     * @return string
+     */
+    private static function innerHTML($element) {
+        $markup = "";
+        foreach ($element->childNodes as $node) {
+            $scratch = new DOMDocument();
+            $copy = $scratch->importNode($node, true);
+            $scratch->appendChild($copy);
+            $markup .= trim($scratch->saveHTML());
+        }
+
+        return $markup;
    }

-    protected static function xmlUnescape($string)
-    {
-        $string = str_replace('&amp;', '&', $string);
-        $string = str_replace('&lt;?', '<?', $string);
-        return $string;
-    }
 }
--- a/system/src/Grav/Common/Page/Page.php
+++ b/system/src/Grav/Common/Page/Page.php
@ -502,7 +502,8 @@ class Page
            $size = 300;
        }

-        return html_entity_decode(Utils::truncateHTML($content, $size));
+        $summary = Utils::truncateHTML($content, $size);
+        return html_entity_decode($summary);
    }

    /**
--- a/system/src/Grav/Common/Utils.php
+++ b/system/src/Grav/Common/Utils.php
@ -183,26 +183,28 @@ abstract class Utils
     * Truncate HTML by number of characters. not "word-safe"!
     *
     * @param  string $text
-     * @param  int    $length
+     * @param  int $length in characters
+     * @param  string $ellipsis
     *
     * @return string
     */
-    public static function truncateHtml($text, $length = 100)
+    public static function truncateHtml($text, $length = 100, $ellipsis = '...')
    {
-        return Truncator::truncate($text, $length, ['length_in_chars' => true]);
+        return Truncator::truncateLetters($text, $length, $ellipsis);
    }

    /**
     * Truncate HTML by number of words in a "word-safe" manner.
     *
     * @param  string $text
-     * @param  int    $length
+     * @param  int    $length in words
+     * @param  string $ellipsis
     *
     * @return string
     */
-    public static function safeTruncateHtml($text, $length = 100)
+    public static function safeTruncateHtml($text, $length = 25, $ellipsis = '...')
    {
-        return Truncator::truncate($text, $length, ['length_in_chars' => true, 'word_safe' => true]);
+        return Truncator::truncateWords($text, $length, $ellipsis);
    }

    /**
--- a/tests/unit/Grav/Common/UtilsTest.php
+++ b/tests/unit/Grav/Common/UtilsTest.php
@ -123,16 +123,22 @@ class UtilsTest extends \Codeception\TestCase\Test

    public function testTruncateHtml()
    {
-        $this->assertEquals('<p>T…</p>', Utils::truncateHtml('<p>This is a string to truncate</p>', 1));
-        $this->assertEquals('<p>This…</p>', Utils::truncateHtml('<p>This is a string to truncate</p>', 4));
-        $this->assertEquals('', Utils::truncateHtml('<input type="file" id="file" multiple />', 6, true));
-
+        $this->assertEquals('<p>T...</p>', Utils::truncateHtml('<p>This is a string to truncate</p>', 1));
+        $this->assertEquals('<p>This...</p>', Utils::truncateHtml('<p>This is a string to truncate</p>', 4));
+        $this->assertEquals('<p>This is a...</p>', Utils::truncateHtml('<p>This is a string to truncate</p>', 10));
+        $this->assertEquals('<p>This is a string to truncate</p>', Utils::truncateHtml('<p>This is a string to truncate</p>', 100));
+        $this->assertEquals('<input type="file" id="file" multiple>', Utils::truncateHtml('<input type="file" id="file" multiple />', 6));
+        $this->assertEquals('<ol><li>item 1 <i>so...</i></li></ol>', Utils::truncateHtml('<ol><li>item 1 <i>something</i></li><li>item 2 <strong>bold</strong></li></ol>', 10));
    }

    public function testSafeTruncateHtml()
    {
-        $this->assertEquals('<p>This…</p>', Utils::safeTruncateHtml('<p>This is a string to truncate</p>', 1));
-        $this->assertEquals('<p>This…</p>', Utils::safeTruncateHtml('<p>This is a string to truncate</p>', 4));
+        $this->assertEquals('<p>This...</p>', Utils::safeTruncateHtml('<p>This is a string to truncate</p>', 1));
+        $this->assertEquals('<p>This is...</p>', Utils::safeTruncateHtml('<p>This is a string to truncate</p>', 2));
+        $this->assertEquals('<p>This is a string to...</p>', Utils::safeTruncateHtml('<p>This is a string to truncate</p>', 5));
+        $this->assertEquals('<p>This is a string to truncate</p>', Utils::safeTruncateHtml('<p>This is a string to truncate</p>', 20));
+        $this->assertEquals('<input type="file" id="file" multiple>', Utils::safeTruncateHtml('<input type="file" id="file" multiple />', 6));
+        $this->assertEquals('<ol><li>item 1 <i>something</i></li><li>item 2...</li></ol>', Utils::safeTruncateHtml('<ol><li>item 1 <i>something</i></li><li>item 2 <strong>bold</strong></li></ol>', 5));
    }

    public function testGenerateRandomString()