Terminators.php 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227
  1. <?php
  2. namespace Highlight;
  /**
   * Combines every pattern that can interrupt a mode into one regular
   * expression and maps a match of that expression back to its rule.
   *
   * The begin pattern of each entry in the mode's `contains` list, plus the
   * mode's own `terminator_end` and `illegal` patterns when set, are joined
   * into a single alternation. Each alternative sits in its own capture group,
   * and the index of that group is remembered so exec() can tell which rule
   * fired.
   *
   * @internal
   *
   * @since 9.16.0.0
   */
  final class Terminators
  {
      /**
       * Forwarded verbatim to RegExUtils::langRe(); presumably a boolean
       * (the type is not enforced here).
       */
      private $caseInsensitive;

      /**
       * Capture-group index of each rule's wrapper group => the rule itself
       * (a Mode for "begin" rules, or the string 'end' / 'illegal').
       *
       * @var array<int, Mode|string>
       */
      private $matchIndexes = array();

      /**
       * The combined matcher; only set once _buildModeRegex() has run.
       *
       * @var RegEx|null
       */
      private $matcherRe = null;

      /**
       * Registered rules in insertion order, each stored as array(rule, regex).
       *
       * @var array<int, array<int, Mode|string>>
       */
      private $regexes = array();

      /**
       * Capture-group index that the next registered rule will occupy.
       *
       * @var int
       */
      private $matchAt = 1;

      /**
       * The mode handed to _buildModeRegex().
       *
       * @var Mode
       */
      private $mode;

      /**
       * Offset copied onto the combined matcher before every exec() call.
       *
       * @var int
       */
      public $lastIndex = 0;

      /**
       * @param mixed $caseInsensitive flag passed through to RegExUtils::langRe()
       */
      public function __construct($caseInsensitive)
      {
          $this->caseInsensitive = $caseInsensitive;
      }

      /**
       * Registers all terminating patterns of $mode and compiles the combined
       * matcher.
       *
       * Rules are registered in this order: one per entry of `$mode->contains`,
       * then 'end' (if `$mode->terminator_end` is truthy), then 'illegal' (if
       * `$mode->illegal` is truthy).
       *
       * @internal
       *
       * @param Mode $mode object exposing `contains`, `terminator_end` and `illegal`
       *
       * @return self
       */
      public function _buildModeRegex($mode)
      {
          $this->mode = $mode;

          foreach ($mode->contains as $term) {
              // Keyword-driven sub-modes additionally accept an optional literal
              // dot on either side of their begin pattern.
              $re = $term->beginKeywords
                  ? "\.?(?:" . $term->begin . ")\.?"
                  : $term->begin;

              $this->addRule($term, $re);
          }

          if ($mode->terminator_end) {
              $this->addRule('end', $mode->terminator_end);
          }

          if ($mode->illegal) {
              $this->addRule('illegal', $mode->illegal);
          }

          $terminators = array();
          foreach ($this->regexes as $regex) {
              // Index 1 holds the pattern, index 0 the rule (see addRule()).
              $terminators[] = $regex[1];
          }

          $this->matcherRe = $this->langRe($this->joinRe($terminators, '|'), true);
          $this->lastIndex = 0;

          return $this;
      }

      /**
       * Runs the combined matcher against $s, starting at $this->lastIndex.
       *
       * On a hit the match object is annotated before being returned:
       *  - string rules ('end' / 'illegal'): `type` is the rule name and `extra`
       *    is array(mode->illegal, mode->terminator_end);
       *  - any other rule: `type` is "begin" and `rule` is the stored rule.
       *
       * @param string $s text to search
       *
       * @return mixed|null the annotated match object produced by the combined
       *                    matcher, or null when no rules are registered or
       *                    nothing matched
       */
      public function exec($s)
      {
          if (count($this->regexes) === 0) {
              return null;
          }

          $this->matcherRe->lastIndex = $this->lastIndex;
          $match = $this->matcherRe->exec($s);
          if (!$match) {
              return null;
          }

          // The first populated group that is a known wrapper group identifies
          // the rule that matched.
          $rule = null;
          $groupCount = count($match);
          for ($i = 0; $i < $groupCount; ++$i) {
              if ($match[$i] !== null && isset($this->matchIndexes[$i])) {
                  $rule = $this->matchIndexes[$i];

                  break;
              }
          }

          if (is_string($rule)) {
              $match->type = $rule;
              $match->extra = array($this->mode->illegal, $this->mode->terminator_end);
          } else {
              $match->type = "begin";
              $match->rule = $rule;
          }

          return $match;
      }

      /**
       * Compiles a pattern through RegExUtils::langRe() using this instance's
       * case-sensitivity flag.
       *
       * @param string $value
       * @param bool   $global
       *
       * @return RegEx
       */
      private function langRe($value, $global = false)
      {
          return RegExUtils::langRe($value, $global, $this->caseInsensitive);
      }

      /**
       * Registers a rule under the next free capture-group index and advances
       * that index past the rule's wrapper group and its own capture groups.
       *
       * @param Mode|string $rule
       * @param string      $regex
       */
      private function addRule($rule, $regex)
      {
          $this->matchIndexes[$this->matchAt] = $rule;
          $this->regexes[] = array($rule, $regex);
          // +1 accounts for the wrapper group joinRe() puts around every pattern.
          $this->matchAt += $this->reCountMatchGroups($regex) + 1;
      }

      /**
       * Joins the given patterns with $separator, wrapping each one in its own
       * capture group and renumbering numeric backreferences so they still
       * point at the intended group inside the combined pattern.
       *
       * Keeping track of which wrapper group belongs to which pattern is left
       * to the caller.
       *
       * @param array  $regexps   pattern strings, or objects exposing `source`
       * @param string $separator
       *
       * @return string
       */
      private function joinRe($regexps, $separator)
      {
          // Matches an opening parenthesis or a numeric backreference. To avoid
          // misparsing it also consumes:
          //  - [...] classes, inside which parentheses and escapes mean
          //    something else;
          //  - any other escape sequence, so an escaped character is never
          //    mistaken for an interesting element;
          //  - "(?" openers (non-capturing / lookaround groups), which do not
          //    capture.
          $backreferenceRe = '#\[(?:[^\\\\\]]|\\\.)*\]|\(\??|\\\([1-9][0-9]*)|\\\.#';
          $numCaptures = 0;
          $ret = '';
          $total = count($regexps);

          for ($i = 0; $i < $total; ++$i) {
              // The wrapper group added below is itself a capture group; every
              // backreference inside this pattern shifts by its index.
              ++$numCaptures;
              $offset = $numCaptures;
              $re = $this->reStr($regexps[$i]);

              if ($i > 0) {
                  $ret .= $separator;
              }

              $ret .= "(";

              while (strlen($re) > 0) {
                  $matches = array();
                  $matchFound = preg_match($backreferenceRe, $re, $matches, PREG_OFFSET_CAPTURE);

                  // 0 means nothing is left to rewrite; false signals a PCRE
                  // error. Either way copy the remainder untouched. Treating
                  // false as a hit would leave $re unchanged and never
                  // terminate this loop.
                  if ($matchFound !== 1) {
                      $ret .= $re;

                      break;
                  }

                  $token = $matches[0][0];
                  $index = $matches[0][1];

                  $ret .= substr($re, 0, $index);
                  $re = substr($re, $index + strlen($token));

                  if (substr($token, 0, 1) === '\\' && isset($matches[1])) {
                      // Numeric backreference: shift it by the wrapper offset.
                      $ret .= "\\" . strval(intval($matches[1][0]) + $offset);
                  } else {
                      $ret .= $token;

                      // Only a bare "(" opens a capturing group; "(?" does not.
                      if ($token === "(") {
                          ++$numCaptures;
                      }
                  }
              }

              $ret .= ")";
          }

          return $ret;
      }

      /**
       * Returns the `source` property of a regex-like object, or the value
       * itself when it has no such property (e.g. a plain string).
       *
       * @param mixed $re
       *
       * @return mixed
       */
      private function reStr($re)
      {
          if ($re && isset($re->source)) {
              return $re->source;
          }

          return $re;
      }

      /**
       * Counts the capture groups in a pattern by matching "<pattern>|" against
       * the empty string and counting the result slots.
       *
       * @param RegEx|string $re
       *
       * @return int number of capture groups; never negative, even when the
       *             pattern fails to compile
       */
      private function reCountMatchGroups($re)
      {
          $results = array();
          // Escape unescaped slashes so they cannot end the "/" delimited
          // pattern built below.
          $escaped = preg_replace('#(?<!\\\)/#um', '\\/', (string) $re);
          preg_match_all("/{$escaped}|/u", '', $results);

          // When the pattern does not compile, $results stays empty and the
          // plain subtraction would yield -1, which would stall the group
          // index in addRule().
          return max(0, count($results) - 1);
      }
  }