Terminators.php 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245
  1. <?php
  2. namespace Highlight;
  3. /**
  4. * @internal
  5. *
  6. * @since 9.16.0.0
  7. */
  8. final class Terminators
  9. {
  10. /** @var bool */
  11. private $caseInsensitive;
  12. /** @var array<int, Mode|string> */
  13. private $matchIndexes = array();
  14. /** @var RegEx|null */
  15. private $matcherRe = null;
  16. /** @var array<int, array<int, Mode|string>> */
  17. private $regexes = array();
  18. /** @var int */
  19. private $matchAt = 1;
  20. /** @var Mode */
  21. private $mode;
  22. /** @var int */
  23. public $lastIndex = 0;
  24. /**
  25. * @param bool $caseInsensitive
  26. */
  27. public function __construct($caseInsensitive)
  28. {
  29. $this->caseInsensitive = $caseInsensitive;
  30. }
  31. /**
  32. * @internal
  33. *
  34. * @param Mode $mode
  35. *
  36. * @return self
  37. */
  38. public function _buildModeRegex($mode)
  39. {
  40. $this->mode = $mode;
  41. $term = null;
  42. for ($i = 0; $i < count($mode->contains); ++$i) {
  43. $re = null;
  44. $term = $mode->contains[$i];
  45. if ($term->beginKeywords) {
  46. $re = "\.?(?:" . $term->begin . ")\.?";
  47. } else {
  48. $re = $term->begin;
  49. }
  50. $this->addRule($term, $re);
  51. }
  52. if ($mode->terminator_end) {
  53. $this->addRule('end', $mode->terminator_end);
  54. }
  55. if ($mode->illegal) {
  56. $this->addRule('illegal', $mode->illegal);
  57. }
  58. /** @var array<int, string> $terminators */
  59. $terminators = array();
  60. foreach ($this->regexes as $regex) {
  61. $terminators[] = $regex[1];
  62. }
  63. $this->matcherRe = $this->langRe($this->joinRe($terminators, '|'), true);
  64. $this->lastIndex = 0;
  65. return $this;
  66. }
  67. /**
  68. * @param string $s
  69. *
  70. * @return RegExMatch|null
  71. */
  72. public function exec($s)
  73. {
  74. if (count($this->regexes) === 0) {
  75. return null;
  76. }
  77. $this->matcherRe->lastIndex = $this->lastIndex;
  78. $match = $this->matcherRe->exec($s);
  79. if (!$match) {
  80. return null;
  81. }
  82. /** @var Mode|string $rule */
  83. $rule = null;
  84. for ($i = 0; $i < count($match); ++$i) {
  85. if ($match[$i] !== null && isset($this->matchIndexes[$i])) {
  86. $rule = $this->matchIndexes[$i];
  87. break;
  88. }
  89. }
  90. if (is_string($rule)) {
  91. $match->type = $rule;
  92. $match->extra = array($this->mode->illegal, $this->mode->terminator_end);
  93. } else {
  94. $match->type = "begin";
  95. $match->rule = $rule;
  96. }
  97. return $match;
  98. }
  99. /**
  100. * @param string $value
  101. * @param bool $global
  102. *
  103. * @return RegEx
  104. */
  105. private function langRe($value, $global = false)
  106. {
  107. return RegExUtils::langRe($value, $global, $this->caseInsensitive);
  108. }
  109. /**
  110. * @param Mode|string $rule
  111. * @param string $regex
  112. *
  113. * @return void
  114. */
  115. private function addRule($rule, $regex)
  116. {
  117. $this->matchIndexes[$this->matchAt] = $rule;
  118. $this->regexes[] = array($rule, $regex);
  119. $this->matchAt += $this->reCountMatchGroups($regex) + 1;
  120. }
  121. /**
  122. * joinRe logically computes regexps.join(separator), but fixes the
  123. * backreferences so they continue to match.
  124. *
  125. * it also places each individual regular expression into it's own
  126. * match group, keeping track of the sequencing of those match groups
  127. * is currently an exercise for the caller. :-)
  128. *
  129. * @param array<int, string> $regexps
  130. * @param string $separator
  131. *
  132. * @return string
  133. */
  134. private function joinRe($regexps, $separator)
  135. {
  136. // backreferenceRe matches an open parenthesis or backreference. To avoid
  137. // an incorrect parse, it additionally matches the following:
  138. // - [...] elements, where the meaning of parentheses and escapes change
  139. // - other escape sequences, so we do not misparse escape sequences as
  140. // interesting elements
  141. // - non-matching or lookahead parentheses, which do not capture. These
  142. // follow the '(' with a '?'.
  143. $backreferenceRe = '#\[(?:[^\\\\\]]|\\\.)*\]|\(\??|\\\([1-9][0-9]*)|\\\.#';
  144. $numCaptures = 0;
  145. $ret = '';
  146. $strLen = count($regexps);
  147. for ($i = 0; $i < $strLen; ++$i) {
  148. ++$numCaptures;
  149. $offset = $numCaptures;
  150. $re = $this->reStr($regexps[$i]);
  151. if ($i > 0) {
  152. $ret .= $separator;
  153. }
  154. $ret .= "(";
  155. while (strlen($re) > 0) {
  156. $matches = array();
  157. $matchFound = preg_match($backreferenceRe, $re, $matches, PREG_OFFSET_CAPTURE);
  158. if ($matchFound === 0) {
  159. $ret .= $re;
  160. break;
  161. }
  162. // PHP aliases to match the JS naming conventions
  163. $match = $matches[0];
  164. $index = $match[1];
  165. $ret .= substr($re, 0, $index);
  166. $re = substr($re, $index + strlen($match[0]));
  167. if (substr($match[0], 0, 1) === '\\' && isset($matches[1])) {
  168. // Adjust the backreference.
  169. $ret .= "\\" . strval(intval($matches[1][0]) + $offset);
  170. } else {
  171. $ret .= $match[0];
  172. if ($match[0] == "(") {
  173. ++$numCaptures;
  174. }
  175. }
  176. }
  177. $ret .= ")";
  178. }
  179. return $ret;
  180. }
  181. /**
  182. * @param RegEx|string $re
  183. *
  184. * @return mixed
  185. */
  186. private function reStr($re)
  187. {
  188. if ($re && isset($re->source)) {
  189. return $re->source;
  190. }
  191. return $re;
  192. }
  193. /**
  194. * @param RegEx|string $re
  195. *
  196. * @return int
  197. */
  198. private function reCountMatchGroups($re)
  199. {
  200. $results = array();
  201. $escaped = preg_replace('#(?<!\\\)/#um', '\\/', (string) $re);
  202. preg_match_all("/{$escaped}|/u", '', $results);
  203. return count($results) - 1;
  204. }
  205. }