123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227 |
- <?php
- namespace Highlight;
/**
 * Combines the regular expressions that can terminate or interrupt a mode
 * (the `begin` of every contained mode, the mode's own `terminator_end` and
 * its `illegal` pattern) into one alternation, and maps a match of that
 * alternation back to the rule that produced it.
 *
 * @internal
 *
 * @since 9.16.0.0
 */
final class Terminators
{
    /** @var bool Forwarded to RegExUtils::langRe() when the matcher is compiled. */
    private $caseInsensitive;

    /**
     * Rule registered at each capture-group index of the joined regex. A rule
     * is either a contained Mode or one of the strings 'end' / 'illegal'.
     *
     * @var array<int, Mode|string>
     */
    private $matchIndexes = array();

    /** @var RegEx|null The compiled alternation; null until _buildModeRegex() runs. */
    private $matcherRe = null;

    /** @var array<int, array<int, Mode|string>> List of [rule, regex] pairs in registration order. */
    private $regexes = array();

    /** @var int Capture-group index the next registered rule will occupy. */
    private $matchAt = 1;

    /** @var Mode */
    private $mode;

    /** @var int Copied onto the matcher's `lastIndex` before every exec(). */
    public $lastIndex = 0;

    /**
     * @param bool $caseInsensitive Passed through to RegExUtils::langRe()
     */
    public function __construct($caseInsensitive)
    {
        $this->caseInsensitive = $caseInsensitive;
    }

    /**
     * Registers every terminator of `$mode` and compiles the joined matcher.
     *
     * @internal
     *
     * @param Mode $mode
     *
     * @return self
     */
    public function _buildModeRegex($mode)
    {
        $this->mode = $mode;

        // addRule() never touches $mode->contains, so the size is loop-invariant.
        $containsCount = count($mode->contains);
        for ($i = 0; $i < $containsCount; ++$i) {
            $term = $mode->contains[$i];

            if ($term->beginKeywords) {
                $re = '\.?(?:' . $term->begin . ')\.?';
            } else {
                $re = $term->begin;
            }

            $this->addRule($term, $re);
        }

        if ($mode->terminator_end) {
            $this->addRule('end', $mode->terminator_end);
        }

        if ($mode->illegal) {
            $this->addRule('illegal', $mode->illegal);
        }

        $terminators = array();
        foreach ($this->regexes as $regex) {
            $terminators[] = $regex[1];
        }

        $this->matcherRe = $this->langRe($this->joinRe($terminators, '|'), true);
        $this->lastIndex = 0;

        return $this;
    }

    /**
     * Runs the joined matcher against `$s`, starting at `$this->lastIndex`.
     *
     * On a hit the returned match is annotated with `type`: the rule string
     * itself for string rules (together with `extra`, holding the mode's
     * `illegal` and `terminator_end`), or "begin" plus `rule` otherwise.
     *
     * @param string $s
     *
     * @return mixed|null The annotated match returned by the matcher, or null
     *                    when no rule is registered or nothing matched
     */
    public function exec($s)
    {
        if (count($this->regexes) === 0) {
            return null;
        }

        $this->matcherRe->lastIndex = $this->lastIndex;
        $match = $this->matcherRe->exec($s);
        if (!$match) {
            return null;
        }

        // The first populated group that sits on a registered index
        // identifies the rule that matched.
        $rule = null;
        $groupCount = count($match);
        for ($i = 0; $i < $groupCount; ++$i) {
            if ($match[$i] !== null && isset($this->matchIndexes[$i])) {
                $rule = $this->matchIndexes[$i];
                break;
            }
        }

        if (is_string($rule)) {
            $match->type = $rule;
            $match->extra = array($this->mode->illegal, $this->mode->terminator_end);
        } else {
            $match->type = "begin";
            $match->rule = $rule;
        }

        return $match;
    }

    /**
     * @param string $value
     * @param bool   $global
     *
     * @return RegEx
     */
    private function langRe($value, $global = false)
    {
        return RegExUtils::langRe($value, $global, $this->caseInsensitive);
    }

    /**
     * Registers `$rule` at the current group index and advances that index
     * past the rule's wrapper group and every capture group inside `$regex`.
     *
     * @param Mode|string  $rule
     * @param RegEx|string $regex
     */
    private function addRule($rule, $regex)
    {
        $this->matchIndexes[$this->matchAt] = $rule;
        $this->regexes[] = array($rule, $regex);
        $this->matchAt += $this->reCountMatchGroups($regex) + 1;
    }

    /**
     * joinRe logically computes regexps.join(separator), but fixes the
     * backreferences so they continue to match.
     *
     * It also places each individual regular expression into its own
     * match group; keeping track of the sequencing of those match groups
     * is currently an exercise for the caller. :-)
     *
     * @param array  $regexps
     * @param string $separator
     *
     * @throws \RuntimeException when PCRE fails while scanning a regex
     *
     * @return string
     */
    private function joinRe($regexps, $separator)
    {
        // backreferenceRe matches an open parenthesis or backreference. To avoid
        // an incorrect parse, it additionally matches the following:
        // - [...] elements, where the meaning of parentheses and escapes change
        // - other escape sequences, so we do not misparse escape sequences as
        //   interesting elements
        // - non-matching or lookahead parentheses, which do not capture. These
        //   follow the '(' with a '?'.
        $backreferenceRe = '#\[(?:[^\\\\\]]|\\\.)*\]|\(\??|\\\([1-9][0-9]*)|\\\.#';
        $numCaptures = 0;
        $ret = '';

        $regexCount = count($regexps);
        for ($i = 0; $i < $regexCount; ++$i) {
            // Account for the wrapper group opened below; backreferences
            // inside this regex are shifted by everything opened so far.
            ++$numCaptures;
            $offset = $numCaptures;
            $re = $this->reStr($regexps[$i]);

            if ($i > 0) {
                $ret .= $separator;
            }

            $ret .= "(";

            while (strlen($re) > 0) {
                $matches = array();
                $matchFound = preg_match($backreferenceRe, $re, $matches, PREG_OFFSET_CAPTURE);

                if ($matchFound === false) {
                    // Without this guard `$re` is never shortened and the
                    // loop spins forever on a PCRE failure.
                    throw new \RuntimeException(
                        'Unable to scan regular expression (PCRE error code ' . preg_last_error() . ')'
                    );
                }

                if ($matchFound === 0) {
                    $ret .= $re;
                    break;
                }

                // PHP aliases to match the JS naming conventions
                $match = $matches[0];
                $index = $match[1];

                $ret .= substr($re, 0, $index);
                $re = substr($re, $index + strlen($match[0]));

                if (substr($match[0], 0, 1) === '\\' && isset($matches[1])) {
                    // Adjust the backreference.
                    $ret .= "\\" . strval(intval($matches[1][0]) + $offset);
                } else {
                    $ret .= $match[0];
                    if ($match[0] === "(") {
                        ++$numCaptures;
                    }
                }
            }

            $ret .= ")";
        }

        return $ret;
    }

    /**
     * Returns the `source` property of `$re` when it has one, otherwise
     * `$re` itself.
     *
     * @param mixed $re
     *
     * @return mixed
     */
    private function reStr($re)
    {
        if ($re && isset($re->source)) {
            return $re->source;
        }

        return $re;
    }

    /**
     * Counts the capture groups of `$re` by matching `$re|` against the
     * empty string and inspecting the groups PCRE reports.
     *
     * @param RegEx|string $re
     *
     * @return int Number of capture groups; 0 when the probe cannot be compiled
     */
    private function reCountMatchGroups($re)
    {
        $results = array();

        // Escape every '/' that is not already escaped. Consuming pairs of
        // backslashes first keeps "\\/" (escaped backslash followed by a bare
        // slash) from being mistaken for an escaped slash.
        $escaped = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)/#um', '$1\\/', (string) $re);

        if (preg_match_all("/{$escaped}|/u", '', $results) === false) {
            return 0;
        }

        // A named group is reported under both its name and its number, so
        // only positive integer keys are counted (key 0 is the whole match).
        $groups = 0;
        foreach (array_keys($results) as $key) {
            if (is_int($key) && $key > 0) {
                ++$groups;
            }
        }

        return $groups;
    }
}
|