/**
 * @package de.atwillys.cc.swl
 * @license BSD (simplified)
 * @author Stefan Wilhelm (stfwi)
 *
 * @file pcre.hh
 * @ccflags -Ipcre/include -Wno-long-long
 * @ldflags -lpcrecpp || libpcrecpp.a libpcre.a
 * @platform linux, bsd, windows
 * @standard >= c++98
 *
 * -----------------------------------------------------------------------------
 *
 * PCRE wrapper class template with implicit pattern parsing for text
 * extraction / replacement.  As search/match/replace/extract specifications
 * are given as one string, this class is suitable to be easily used as user
 * definable pre/post processing, e.g. via command line arguments or configuration
 * files.
 *
 * Perl-like patterns e.g.:
 *
 *  - '/pattern/mods'              returns first match
 *  - '/pattern/extract/mods'      returns first match (with replacement spec)
 *  - 's/pattern/replace/mods'     replaces first occurrence (all with `g`)
 *
 *  - allowed separators: `/`, `|`, `#`, `%` (e.g. `m|pattern|opts`)
 *
 *  - allowed modifiers:
 *
 *   `i`  Ignore case (as in Perl).
 *   `x`  Permit whitespaces and comments in the pattern (as in Perl).
 *   `m`  Multi line: `^` and `$` match start/end of each line (as in Perl).
 *   `s`  `.` matches newlines as well (as in Perl).
 *   `g`  Global: replace all occurrences (only with `s/pattern/replace/`).
 *   `U`  Disable UTF-8 mode (UTF-8 is enabled by default).
 *   `$`  `$` matches only at the very end of the text (not before a final newline).
 *   `!`  Meaning of `*?` and `*` swapped (`*?` now consumes as much as possible).
 *   `*`  Disable parentheses (subexpression) matching.
 *   `X`  Extra (PCRE strict escape parsing).
 *
 * Pattern examples:
 *
 *  "/([xy]=[\\d\\.e])/\\1:\\2/"    Extract first of x,y=float, reformat = to :
 *  "m/([xyz]=[\\d\\.e])/$1:$2/"    Same as above
 *  "s/([xyz]=[\\d\\.e])/\\1:\\2/"  Replace all x,y,z=float from `=` to `:`
 *  "s| [\\n]*(abc) [\\s]* |X|smix" Replace abc with X, ignore case, multiline
 *
 * Usage example:
 *
 *  pcre_regex re;
 *  re.pattern(my_pattern).apply_to(string_reference);
 *  if(re.ok()) { ... } else { throw re.error(); }
 *
 * Template specialisation (std::string):
 *
 *  - typedef detail::basic_pcre<std::string> pcre_regex;
 *
 * -----------------------------------------------------------------------------
 *
 * Hint: Getting/building PCRE from source
 *
 * Add the target `update-pcre` shown below to your makefile. It retrieves the
 * sources from the official SVN repository into the subdirectory `pcre`, builds
 * them, and strips everything except the includes and libs.
 *
 *   +++ Makefile +++
 *
 *   .PHONY: update-pcre
 *   update-pcre:
 *     @-rm -rf pcre
 *     @mkdir pcre
 *     @cd pcre; svn co svn://vcs.exim.org/pcre/code/trunk src
 *     @cd pcre/src; ./autogen.sh
 *     @cd pcre/src; ./configure --enable-utf --prefix=$(shell pwd)/pcre/
 *     @cd pcre/src; make
 *     @cd pcre/src; make install
 *     @cd pcre/src; make clean
 *     @cd pcre; rm -rf bin
 *     @cd pcre; rm -rf share
 *     @cd pcre/lib; rm -rf pkgconfig
 *     @cd pcre; rm -rf src
 *
 * -----------------------------------------------------------------------------
 * +++ BSD license header +++
 * Copyright (c) 2009-2014, Stefan Wilhelm (stfwi, <cerbero s@atwilly s.de>)
 * All rights reserved.
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met: (1) Redistributions
 * of source code must retain the above copyright notice, this list of conditions
 * and the following disclaimer. (2) Redistributions in binary form must reproduce
 * the above copyright notice, this list of conditions and the following disclaimer
 * in the documentation and/or other materials provided with the distribution.
 * (3) Neither the name of atwillys.de nor the names of its contributors may be
 * used to endorse or promote products derived from this software without specific
 * prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS
 * AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER
 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 * -----------------------------------------------------------------------------
 */
#ifndef SW__PCRE_HH
#define SW__PCRE_HH

#include <pcrecpp.h>
#include <string>
#include <iostream>
#include <vector>

// NOTE(review): header-scope using-directive; it makes all `std` names visible
// in every translation unit that includes this file. Kept unchanged because
// includers may rely on it.
using namespace std;

namespace sw { namespace detail {

/**
 * PCRE wrapper that is configured by ONE Perl-like specification string
 * (`/search/mods`, `/search/extract/mods`, `m/search/mods`,
 * `s/search/replace/mods`). The string is split into search part, replace
 * part and modifiers, the search part is compiled with pcrecpp, and the
 * result can be applied to subject strings. Errors are not thrown; they
 * are stored and can be fetched with `error()` / checked with `ok()`.
 *
 * @tparam str_t String type (the regex is compiled via pcrecpp::RE, so this
 *               is expected to be std::string compatible).
 */
template <typename str_t>
class basic_pcre {
public:

  /**
   * Construct empty PCRE (no pattern, no error).
   */
  inline basic_pcre() : is_replace_(false), is_global_(false), sep_('/'),
      separators_("/|#%"), pattern_(), srch_(), flgs_(), repl_(), error_(),
      re_("")
  { ; }

  /**
   * PCRE with pattern to compile (immediately parsed and compiled).
   * Check `ok()` / `error()` afterwards.
   * @param const str_t& pattern__
   */
  inline basic_pcre(const str_t& pattern__) : is_replace_(false), is_global_(false),
      sep_('/'), separators_("/|#%"), pattern_(), srch_(), flgs_(),
      repl_(), error_(), re_("")
  { pattern(pattern__); }

  /**
   * Copy constructor
   * @param re__
   */
  inline basic_pcre(const basic_pcre &re__) : is_replace_(re__.is_replace_),
      is_global_(re__.is_global_), sep_(re__.sep_), separators_(re__.separators_),
      pattern_(re__.pattern_), srch_(re__.srch_), flgs_(re__.flgs_), repl_(re__.repl_),
      error_(re__.error_), re_(re__.re_)
  { ; }

  /**
   * Destructor
   */
  virtual ~basic_pcre()
  { ; }

public:

  /**
   * Returns the complete pattern given
   * @return const str_t &
   */
  inline const str_t & pattern() const
  { return pattern_; }

  /**
   * Returns parsed search part of the pattern
   * @return const str_t &
   */
  inline const str_t & search() const
  { return srch_; }

  /**
   * Returns parsed replace part of the pattern (empty if no replace)
   * @return const str_t &
   */
  inline const str_t & replace() const
  { return repl_; }

  /**
   * Returns search/replace options part of the pattern
   * @return const str_t &
   */
  inline const str_t & modifiers() const
  { return flgs_; }

  /**
   * Returns an error text, empty string if no error
   * @return const str_t &
   */
  inline const str_t & error() const
  { return error_; }

  /**
   * Returns true if there is no error.
   * @return bool
   */
  inline bool ok() const
  { return error_.empty(); }

  /**
   * Returns true if global replace (all occurrences, not only first one)
   * is set. Only ever true for `s/.../.../g` patterns.
   * @return bool
   */
  inline bool is_global() const
  { return is_global_; }

  /**
   * Returns true if the pattern says that the expression shall replace, not
   * search (pattern started with `s`).
   * @return bool
   */
  inline bool is_replace() const
  { return is_replace_; }

public:

  /**
   * Quote a string so that it is matched literally when used in a regex
   * (forwards to pcrecpp::RE::QuoteMeta).
   * @param const str_t& s
   * @return str_t
   */
  inline static str_t quote(const str_t& s)
  { return pcrecpp::RE::QuoteMeta(pcrecpp::StringPiece(s)); }

public:

  /**
   * Resets the object: clears all parsed strings, the error text and the
   * replace/global flags.
   * @return basic_pcre& *this
   */
  inline basic_pcre& clear()
  {
    pattern_ = srch_ = repl_ = flgs_ = error_ = "";
    // Also drop the flags, so a failed re-parse does not report stale state.
    is_replace_ = is_global_ = false;
    return *this;
  }

  /**
   * Sets the pattern to search/replace, parses the pattern components
   * and compiles the regex string. Does not explicitly throw exceptions,
   * but sets an error string fetchable using `error()`.
   * @param const str_t &pattern
   * @return basic_pcre& *this
   */
  basic_pcre& pattern(const str_t &pattern)
  {
    typedef typename str_t::size_type size_type;

    // Copy BEFORE clear(): the argument may alias pattern_, e.g. when
    // re-parsing with `re.pattern(re.pattern())`.
    str_t pt(pattern);  // working copy, e.g. [sm]/^(.*?)$//[ig]
    str_t op, rp;       // options (modifiers), replace part
    bool is_rp = false, is_match = false;
    clear();
    pattern_ = pt;

    if(pt.length()<3) { // e.g. "//" or "||"
      error_ = "Empty pattern";
      return *this;
    }

    // Optional first pattern characters 's', 'm'
    if(pt[0] == 's') { // definitely search replace, otherwise check
      is_rp = true;
      pt.erase(0, 1);
    } else if(pt[0] == 'm') { // definitely match
      is_match = true;
      pt.erase(0, 1);
    }

    // Pattern separator detection
    if(separators_.find(pt[0]) == str_t::npos) {
      error_ = "Invalid pattern (must start with one of the separators: ";
      error_ += separators_ + ")";
      return *this;
    }
    const typename str_t::value_type sep = pt[0];
    pt.erase(0, 1);
    // NOTE(review): the member sep_ is not updated here; only the local
    // separator is used for parsing.

    // Scan backwards over the trailing modifiers up to the last separator.
    // pt is never empty here: the input had at least 3 characters and at
    // most two (prefix and opening separator) were removed.
    const str_t aflags = "ixsmU!$X*g"; // allowed modifiers/flags/options
    size_type k;
    for(k=pt.length()-1; k>1; k--) {
      if(pt[k] == sep) break;
      if(aflags.find(pt[k]) == str_t::npos) {
        error_ = str_t("Unknown modifier/option '") + pt[k] + "'";
        return *this;
      }
    }
    if(pt[k] != sep) {
      error_ = "Empty pattern";
      return *this;
    }

    if(k<pt.length()-1) op = pt.substr(k+1);
    pt = pt.substr(0, k);

    {
      // A temporary backslash is appended so that `src[k+1]` is always a
      // valid look-ahead, also for a trailing backslash. It is not part of
      // the input: only the first `len` characters are processed.
      const str_t src = pt + "\\";
      const size_type len = src.length()-1;
      pt.clear(); pt.reserve(src.length()*2);

      // Search pattern: First unescaped separator means "end of search",
      // except if explicitly defined only search using 'm' as first pattern
      // character.
      bool done = false;
      for(k=0; !done && k<len; k++) {
        switch(src[k]) {
          case '\\':
            if(src[k+1]=='\\' || src[k+1]==sep) {
              // Escaped backslash or separator: keep only the escaped char.
              pt.push_back(src[++k]);
            } else {
              // Any other escape is left to the regex compiler.
              pt.push_back(src[k]);
            }
            break;
          case '\n':
            // re-escape (depends on shell)
            pt += "\\n";
            break;
          case '\t':
            pt += "\\t";
            break;
          case '\r':
            pt += "\\r";
            break;
          default:
            if(!is_match && src[k]==sep) {
              done = true;
            } else { // user didn't escape the separator, try to "see it right".
              pt.push_back(src[k]);
            }
        }
      }

      // Replace pattern: rest of it (if any), unescaped separators ignored
      // and used as normal character, but escaping allowed.
      // References: $1... and \1... allowed.
      for(; k<len; k++) {
        switch(src[k]) {
          case '\\':
            switch(src[k+1]) {
              case '\\': rp.push_back(src[++k]); break;
              case '$': rp.push_back(src[++k]); break;
              case '0': rp.push_back('\0'); ++k; break;
              case 'a': rp.push_back('\a'); ++k; break;
              case 'b': rp.push_back('\b'); ++k; break;
              case 'f': rp.push_back('\f'); ++k; break;
              case 'n': rp.push_back('\n'); ++k; break;
              case 'r': rp.push_back('\r'); ++k; break;
              case 't': rp.push_back('\t'); ++k; break;
              case 'v': rp.push_back('\v'); ++k; break;
              default:
                // Escaped separator -> separator; anything else (e.g. \1)
                // keeps its backslash, the following char is copied next.
                rp.push_back( (src[k+1]==sep) ? src[++k] : src[k]);
            }
            break;
          case '$':
            // replace $1 --> \1 ; plain range check avoids calling
            // std::isdigit with a possibly negative char value.
            rp += (src[k+1] >= '0' && src[k+1] <= '9') ? "\\" : "$";
            break;
          default:
            rp.push_back(src[k]);
        }
      }
    }

    if(pt.empty()) {
      error_ = "Empty pattern";
      return *this;
    }

    pcrecpp::RE_Options opts;
    opts.set_match_limit(10000); // const for now.
    opts.set_match_limit_recursion(500); // const for now.
    opts.set_caseless((op.find('i') != str_t::npos));
    opts.set_utf8((op.find('U') == str_t::npos)); // UTF-8 on unless `U` given
    opts.set_extended((op.find('x') != str_t::npos));
    opts.set_dotall((op.find('s') != str_t::npos));
    opts.set_multiline((op.find('m') != str_t::npos));
    opts.set_ungreedy((op.find('!') != str_t::npos));
    opts.set_dollar_endonly((op.find('$') != str_t::npos));
    opts.set_extra((op.find('X') != str_t::npos));
    opts.set_no_auto_capture((op.find('*') != str_t::npos));
    re_ = pcrecpp::RE(pt, opts);
    is_global_ = is_rp && (op.find('g') != str_t::npos);
    is_replace_ = is_rp;
    srch_ = pt;
    repl_ = rp;
    flgs_ = op;
    if(!re_.error().empty()) error_ = re_.error();
    return *this;
  }

  /**
   * Applies the search/replace pattern regex to the given string.
   * THE STRING WILL BE MODIFIED: for replace patterns the replacement is
   * done in place, for match patterns the subject is overwritten with the
   * extracted text (the whole match if no extract spec was given). Does
   * nothing if an error is already set.
   * @param str_t & subject
   * @return basic_pcre& *this
   */
  basic_pcre& apply_to(str_t &subject)
  {
    if(!error_.empty()) return *this;
    if(srch_.empty()) { error_ = "No pattern given to search/replace."; return *this; }

    if(is_replace_) {
      if(is_global_) {
        re_.GlobalReplace(repl_, &subject);
      } else {
        re_.Replace(repl_, &subject);
      }
    } else {
      str_t out;
      // No extract spec: "\0" refers to the whole match.
      str_t rep = repl_.empty() ? str_t("\\0") : repl_;
      re_.Extract(rep, subject, &out);
      subject = out;
    }
    if(!re_.error().empty()) {
      error_ = re_.error();
      subject = "";
    }
    return *this;
  }

  /**
   * Applies the search/replace pattern regex to a COPY of the given string
   * and returns the result; the argument itself is not modified. Returns an
   * empty string if an error is set (see `error()`).
   * @param const str_t& subject
   * @return str_t
   */
  str_t operator () (const str_t& subject)
  { str_t s=subject; apply_to(s); if(!ok()) s=""; return s; }

protected:

  bool is_replace_;  // Pattern started with `s`
  bool is_global_;   // Replace pattern with modifier `g`
  typename str_t::value_type sep_;
  str_t separators_; // Allowed separators
  str_t pattern_;    // The whole pattern given
  str_t srch_;       // Parsed search part
  str_t flgs_;       // Parsed modifiers
  str_t repl_;       // Parsed replace/extract part
  str_t error_;      // Error text, empty if ok
  pcrecpp::RE re_;   // PCRE main object

};

/**
 * ostream <<
 * Writes a human readable, multi-line description of the regex object
 * (state, parsed parts and the meaning of the modifiers) to the stream.
 * The stream is flushed once at the end.
 * @param std::basic_ostream<typename str_t::value_type>& os
 * @param const basic_pcre<str_t>& re
 * @return std::basic_ostream<typename str_t::value_type>&
 */
template <typename str_t>
std::basic_ostream<typename str_t::value_type>& operator << (
   std::basic_ostream<typename str_t::value_type>& os,
   const basic_pcre<str_t>& re
)
{
  const str_t& mods = re.modifiers();

  os << "\n" << "{" << "\n"
     << " - ok: " << (re.ok() ? "yes" : "no") << "\n"
     << " - pattern: \"" << re.pattern() << "\"" << "\n";
  os << (re.is_replace() ? " - search: \"" : " - match: \"")
     << re.search() << "\"" << "\n";
  if(!re.replace().empty()) {
    os << " - replace: \"" << re.replace() << "\"" << "\n";
  }
  os << " - modifiers: \"" << mods << "\"," << "\n";
  os << (re.is_global()
      ? "   - global: replace all matches (`g`)"
      : "   - not global: replace only first match (no `g`)") << "\n";
  os << ((mods.find('i') != str_t::npos)
      ? "   - case insensitive matching (`i`)"
      : "   - case sensitive matching (no `i`)") << "\n";
  os << ((mods.find('x') != str_t::npos)
      ? "   - whitespaces and comments in pattern permitted (`x`)"
      : "   - comments/unmatched spaces in pattern not permitted (no `x`)") << "\n";
  // `m` maps to PCRE multiline mode: ^/$ then match at every line
  // boundary; without it they match only at start/end of the whole text.
  os << ((mods.find('m') != str_t::npos)
      ? "   - Multiline (^/$ match start/end of each line) (`m`)"
      : "   - Single line (^/$ match start/end of text) (no `m`)") << "\n";
  os << ((mods.find('s') != str_t::npos)
      ? "   - `.` matches newlines as well (`s`)"
      : "   - `.` does not match newlines (no `s`)") << "\n";
  if(mods.find('$') != str_t::npos) {
    os << "   - `$` matches only at the end. (`$`)" << "\n";
  }
  if(mods.find('!') != str_t::npos) {
    os << "   - Meaning of `*?` and `*` swapped. (`!`)" << "\n";
  }
  if(mods.find('*') != str_t::npos) {
    os << "   - Sub pattern matching disabled (`*`)" << "\n";
  }
  if(mods.find('X') != str_t::npos) {
    os << "   - (PCRE:) Extra strict pattern escape parsing. (`X`)" << "\n";
  }
  os << ((mods.find('U') != str_t::npos)
      ? "   - UTF support disabled. (`U`)"
      : "   - UTF support enabled. (no `U`)") << "\n";
  os << "}" << std::endl; // single flush at the end
  return os;
}

}}

namespace sw {
  // Default specialisation: PCRE wrapper operating on std::string.
  typedef detail::basic_pcre<std::string> pcre_regex;
}
#endif
