HTML Document class
HTML-Dokument Klasse
This class is a wrapper for the PHP DOM/XPath functions. Its purpose is to perform frequently needed tasks, such as removing comments, CSS and scripts or extracting links, with a single method call. Take a look at the defined methods - they do what their names say.
Diese Klasse ist ein Wrapper für die PHP DOM/XPath-Funktionen. Zweck ist, oft benötigte Aufgaben (wie Kommentare/CSS/JS entfernen, Links herausfiltern usw.) in einem Methodenaufruf zu erledigen. Einfach einen Blick auf die Methoden werfen - sie sind nach ihrem Zweck benannt.
Class source code
Klassen-Quelltext
<?php
/**
 * HTML Document manipulation and parsing
 *
 * @package de.atwillys.sw.php.swlib.util
 * @author Stefan Wilhelm
 * @copyright Stefan Wilhelm, 2010
 * @license GPL
 * @version 1.0
 */
 
namespace sw;
 
 
/**
 * Convenience wrapper around \DOMDocument / \DOMXPath for common HTML clean-up
 * tasks: extracting links, stripping scripts, styles, comments, attributes
 * or individual tags. All modifying methods return $this so that calls can be
 * chained.
 */
class HtmlDocument {

  /**
   * Internal DOM document instance
   *
   * @var \DOMDocument
   */
  private $dom = null;

  /**
   * Internal XML path object, always bound to the current $dom
   *
   * @var \DOMXPath
   */
  private $xpath = null;

  /**
   * Constructor, optionally the HTML content can be set here.
   * @param string $htmlText
   */
  public function __construct($htmlText="") {
    $this->dom = new \DOMDocument();
    $this->xpath = new \DOMXPath($this->dom);
    // Compare by length instead of empty(): the string "0" is valid content.
    if(strlen((string)$htmlText) > 0) {
      $this->setHtml($htmlText);
    }
  }

  /**
   * Returns the HTML text of the object
   * @return string
   */
  public function __toString() {
    return $this->getHtml();
  }

  /**
   * Sets a new HTML text to work with. Leading and trailing whitespace is
   * stripped before parsing. An empty (or whitespace-only) text resets the
   * object to an empty document.
   * @param string $html
   * @return \sw\HtmlDocument
   */
  public function setHtml($html) {
    $html = trim((string)$html, "\n\r\t ");
    if($html === '') {
      // loadHTML() rejects an empty source, so start over with a blank
      // document instead of calling it.
      $this->dom = new \DOMDocument();
    } else {
      $this->dom->loadHTML($html);
    }
    // The XPath object has to be re-created for the newly loaded document.
    $this->xpath = new \DOMXPath($this->dom);
    return $this;
  }

  /**
   * Returns the HTML text contained in the object, without leading or
   * trailing whitespace.
   * @return string
   */
  public function getHtml() {
    return trim($this->dom->saveHTML(), "\n\r\t ");
  }

  /**
   * Returns the links (<a> elements with a non-empty href attribute) of the
   * document as associative array. The keys are the href values, the values
   * are the trimmed texts of all text nodes below the anchor, each followed
   * by one space when concatenated. If the same href occurs more than once,
   * the last occurrence wins.
   * @return array
   */
  public function getLinks() {
    $links = array();
    foreach($this->dom->getElementsByTagName('a') as $anchor) {
      // getAttribute() returns "" if the attribute does not exist
      $href = $anchor->getAttribute('href');
      if(strlen($href) == 0) {
        continue;
      }
      $text = "";
      foreach($this->xpath->query('.//text()', $anchor) as $textNode) {
        $text .= $textNode->textContent . " ";
      }
      $links[$href] = trim($text);
    }
    return $links;
  }

  /**
   * Removes <script> ... </script> and <noscript>...</noscript> tags
   * @return \sw\HtmlDocument
   */
  public function removeScriptBlocks() {
    return $this->removeTagsWithSubnodes(array('script','noscript'));
  }

  /**
   * Removes <style> ... </style> and <style ... /> tags
   * @return \sw\HtmlDocument
   */
  public function removeStyleBlocks() {
    return $this->removeTagsWithSubnodes(array('style'));
  }

  /**
   * Removes comments
   * @return \sw\HtmlDocument
   */
  public function removeComments() {
    return $this->removeByXPathQuery('//comment()');
  }

  /**
   * Replaces non-breaking spaces (&nbsp;, U+00A0) in the text nodes of the
   * document with normal spaces.
   * @return \sw\HtmlDocument
   */
  public function nbspToSpace() {
    // The DOM stores text as UTF-8, where U+00A0 is the byte pair C2 A0.
    // Working on the text nodes avoids serializing and re-parsing the
    // whole document.
    $nbsp = "\xC2\xA0";
    foreach($this->xpath->query('//text()') as $textNode) {
      if(strpos($textNode->nodeValue, $nbsp) !== false) {
        $textNode->nodeValue = str_replace($nbsp, " ", $textNode->nodeValue);
      }
    }
    return $this;
  }

  /**
   * Removes every direct child of the <html> element that is not the <body>.
   * If no <html> or <body> tag is present the whole text is interpreted to
   * be the body (simply does nothing).
   * @return \sw\HtmlDocument
   */
  public function bodyOnly() {
    $htmlNodes = $this->dom->getElementsByTagName('html');
    // Use ->length: count() on a DOMNodeList only works since PHP 7.2.
    if($htmlNodes->length == 0 || $this->dom->getElementsByTagName('body')->length == 0) {
      return $this;
    }
    $html = $htmlNodes->item(0);
    // Walk backwards because childNodes is live and shrinks on removal.
    for($i=$html->childNodes->length-1; $i>=0; $i--) {
      $child = $html->childNodes->item($i);
      if(strtolower($child->nodeName) != "body") {
        $html->removeChild($child);
      }
    }
    return $this;
  }

  /**
   * Removes nodes matching a specified xpath. Does nothing if the query
   * cannot be evaluated.
   * @param string $query
   * @return \sw\HtmlDocument
   */
  public function removeByXPathQuery($query) {
    $nodes = $this->xpath->query($query);
    if($nodes === false) {
      // Invalid expression: nothing can be matched.
      return $this;
    }
    for($i = $nodes->length-1; $i>=0; $i--) {
      $node = $nodes->item($i);
      if($node->parentNode) {
        $node->parentNode->removeChild($node);
      }
    }
    return $this;
  }

  /**
   * Removes all attributes except the specified exceptions (attribute names,
   * compared case insensitively). Affects all nodes that have attributes.
   * @param array $exceptions
   * @return \sw\HtmlDocument
   */
  public function removeAttributes(array $exceptions=array('id', 'name', 'class', 'href', 'rel', 'src', 'colspan', 'rowspan')) {
    // Lookup table: lower case attribute name => true
    $keep = array();
    foreach($exceptions as $name) {
      $keep[strtolower($name)] = true;
    }
    // Only elements that carry at least one attribute are of interest.
    foreach($this->xpath->query('//*[@*]') as $element) {
      // Collect first, remove afterwards: the attribute map is live.
      $remove = array();
      foreach($element->attributes as $attribute) {
        if(!isset($keep[strtolower($attribute->name)])) {
          $remove[] = $attribute->name;
        }
      }
      foreach($remove as $name) {
        try {
          $element->removeAttribute($name);
        } catch(\DOMException $e) {
          // Best effort: an attribute that cannot be removed is left as is.
        }
      }
    }
    return $this;
  }

  /**
   * Removes tags without removing the contents or subnodes: the children of
   * each matching element take the place of the element itself. Tag names
   * are matched case insensitively, characters that are not letters, digits
   * or underscores are stripped from the names. The document root element
   * is never removed.
   * @param array $tags
   * @return \sw\HtmlDocument
   */
  public function removeTags(array $tags=array()) {
    foreach($tags as $tag) {
      $tag = strtolower(preg_replace('/[\W]/i', '', $tag));
      if($tag === '') {
        continue;
      }
      $elements = $this->nodeListToArray($this->dom->getElementsByTagName($tag));
      foreach($elements as $element) {
        $parent = $element->parentNode;
        if(!($parent instanceof \DOMElement)) {
          // Detached node or document root: unwrapping is not possible.
          continue;
        }
        // Move the children in front of the element, keeping their order,
        // then drop the now empty element.
        while($element->firstChild) {
          $parent->insertBefore($element->firstChild, $element);
        }
        $parent->removeChild($element);
      }
    }
    return $this;
  }

  /**
   * Removes tags by name including all subnodes
   * @param array $tags
   * @return \sw\HtmlDocument
   */
  public function removeTagsWithSubnodes(array $tags=array()) {
    foreach($tags as $tag) {
      $nodes = $this->nodeListToArray($this->dom->getElementsByTagName($tag));
      // Remove from the end of the list to the beginning.
      foreach(array_reverse($nodes) as $node) {
        try {
          if($node->parentNode) {
            $node->parentNode->removeChild($node);
          }
        } catch(\DOMException $e) {
          // Best effort: a node that cannot be removed is left in place.
        }
      }
    }
    return $this;
  }

  /**
   * Copies the nodes of a (possibly live) node list into a plain array, so
   * that the document can be modified while iterating over the nodes.
   * @param \DOMNodeList $nodeList
   * @return array
   */
  private function nodeListToArray($nodeList) {
    $nodes = array();
    foreach($nodeList as $node) {
      $nodes[] = $node;
    }
    return $nodes;
  }
}

