Crawler.php 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229
  1. <?php
  2. /*
  3. * This file is part of the Symfony package.
  4. *
  5. * (c) Fabien Potencier <fabien@symfony.com>
  6. *
  7. * For the full copyright and license information, please view the LICENSE
  8. * file that was distributed with this source code.
  9. */
  10. namespace Symfony\Component\DomCrawler;
  11. use Masterminds\HTML5;
  12. use Symfony\Component\CssSelector\CssSelectorConverter;
  13. /**
  14. * Crawler eases navigation of a list of \DOMNode objects.
  15. *
  16. * @author Fabien Potencier <fabien@symfony.com>
  17. */
  18. class Crawler implements \Countable, \IteratorAggregate
  19. {
  20. protected $uri;
  21. /**
  22. * @var string The default namespace prefix to be used with XPath and CSS expressions
  23. */
  24. private $defaultNamespacePrefix = 'default';
  25. /**
  26. * @var array A map of manually registered namespaces
  27. */
  28. private $namespaces = [];
  29. /**
  30. * @var string The base href value
  31. */
  32. private $baseHref;
  33. /**
  34. * @var \DOMDocument|null
  35. */
  36. private $document;
  37. /**
  38. * @var \DOMNode[]
  39. */
  40. private $nodes = [];
  41. /**
  42. * Whether the Crawler contains HTML or XML content (used when converting CSS to XPath).
  43. *
  44. * @var bool
  45. */
  46. private $isHtml = true;
  47. /**
  48. * @var HTML5|null
  49. */
  50. private $html5Parser;
  /**
   * Builds a crawler and optionally seeds it with initial content.
   *
   * @param \DOMNodeList|\DOMNode|\DOMNode[]|string|null $node     Initial node(s) or markup, forwarded to add()
   * @param string|null                                  $uri      Stored as the current URI
   * @param string|null                                  $baseHref Base href; when empty, $uri is used instead
   */
  public function __construct($node = null, string $uri = null, string $baseHref = null)
  {
      $this->uri = $uri;
      $this->baseHref = $baseHref ? $baseHref : $uri;

      // The HTML5 parser is optional: it is only instantiated when the class can be loaded.
      if (class_exists(HTML5::class)) {
          $this->html5Parser = new HTML5(['disable_html_ns' => true]);
      } else {
          $this->html5Parser = null;
      }

      $this->add($node);
  }
  /**
   * Gives back the URI this crawler was created with.
   *
   * @return string
   */
  public function getUri()
  {
      $currentUri = $this->uri;

      return $currentUri;
  }
  /**
   * Gives back the base href currently stored on the crawler.
   *
   * @return string
   */
  public function getBaseHref()
  {
      $currentBaseHref = $this->baseHref;

      return $currentBaseHref;
  }
  /**
   * Empties the crawler: forgets every node and the document they belonged to.
   */
  public function clear()
  {
      $this->document = null;
      $this->nodes = [];
  }
  /**
   * Adds something to the current list of nodes.
   *
   * The argument is dispatched to the specialized add*() method that
   * matches its type; null is silently ignored.
   *
   * @param \DOMNodeList|\DOMNode|\DOMNode[]|string|null $node A node
   *
   * @throws \InvalidArgumentException when node is not the expected type
   */
  public function add($node)
  {
      // Nothing to add.
      if (null === $node) {
          return;
      }

      if ($node instanceof \DOMNodeList) {
          $this->addNodeList($node);

          return;
      }

      if ($node instanceof \DOMNode) {
          $this->addNode($node);

          return;
      }

      if (\is_array($node)) {
          $this->addNodes($node);

          return;
      }

      if (\is_string($node)) {
          $this->addContent($node);

          return;
      }

      $actualType = \is_object($node) ? \get_class($node) : \gettype($node);

      throw new \InvalidArgumentException(sprintf('Expecting a DOMNodeList or DOMNode instance, an array, a string, or null, but got "%s".', $actualType));
  }
  /**
   * Adds HTML/XML content.
   *
   * When no type is given, content starting with an XML declaration is
   * treated as "application/xml" and everything else as "text/html".
   * Content whose type mentions neither "xml" nor "html" is ignored.
   *
   * If the charset is not set via the content type, it is looked up in a
   * <meta charset> declaration of the content; failing that it is assumed
   * to be UTF-8, or ISO-8859-1 as a fallback, which is the default charset
   * defined by the HTTP 1.1 specification.
   *
   * @param string      $content The HTML or XML markup
   * @param string|null $type    The content type, optionally with a "charset=" parameter
   */
  public function addContent(string $content, string $type = null)
  {
      if (empty($type)) {
          $type = 0 === strpos($content, '<?xml') ? 'application/xml' : 'text/html';
      }

      // DOM only for HTML/XML content
      if (!preg_match('/(x|ht)ml/i', $type, $xmlMatches)) {
          return;
      }

      $charset = null;
      if (false !== $pos = stripos($type, 'charset=')) {
          $charset = substr($type, $pos + 8);
          if (false !== $pos = strpos($charset, ';')) {
              $charset = substr($charset, 0, $pos);
          }

          // Parameter values may be quoted and padded (charset="utf-8"); keep only the bare name
          // and fall back to auto-detection when nothing usable is left.
          $charset = trim($charset, " \t\r\n\"'");
          if ('' === $charset) {
              $charset = null;
          }
      }

      // http://www.w3.org/TR/encoding/#encodings
      // http://www.w3.org/TR/REC-xml/#NT-EncName
      if (null === $charset &&
          preg_match('/\<meta[^\>]+charset *= *["\']?([a-zA-Z\-0-9_:.]+)/i', $content, $matches)) {
          $charset = $matches[1];
      }

      if (null === $charset) {
          // A string that is valid UTF-8 matches the empty pattern in "u" mode.
          $charset = preg_match('//u', $content) ? 'UTF-8' : 'ISO-8859-1';
      }

      // The type was matched case-insensitively, so the captured group must be
      // compared case-insensitively too ("application/XML" is still XML).
      if ('x' === strtolower($xmlMatches[1])) {
          $this->addXmlContent($content, $charset);
      } else {
          $this->addHtmlContent($content, $charset);
      }
  }
  /**
   * Parses HTML markup and adds the resulting document to the list of nodes.
   *
   * The HTML5 parser is used when it is available and the content starts
   * (after optional whitespace) with an HTML5 doctype; otherwise the content
   * goes through parseXhtml().
   *
   * If the content has a <base> element with a non-empty href, the crawler's
   * base href is updated: resolved through a Link against the current base
   * href when one is set, or taken as is otherwise.
   *
   * The libxml errors are disabled when the content is parsed.
   *
   * If you want to get parsing errors, be sure to enable
   * internal errors via libxml_use_internal_errors(true)
   * and then, get the errors via libxml_get_errors(). Be
   * sure to clear errors with libxml_clear_errors() afterward.
   */
  public function addHtmlContent(string $content, string $charset = 'UTF-8')
  {
      // Use HTML5 parser if the content is HTML5 and the library is available
      $useHtml5Parser = null !== $this->html5Parser
          && strspn($content, " \t\r\n") === stripos($content, '<!doctype html>');

      if ($useHtml5Parser) {
          $dom = $this->parseHtml5($content, $charset);
      } else {
          $dom = $this->parseXhtml($content, $charset);
      }

      $this->addDocument($dom);

      $baseHrefs = $this->filterRelativeXPath('descendant-or-self::base')->extract(['href']);
      $firstBaseHref = current($baseHrefs);

      // No <base> element, or its href is empty: keep the current base href.
      if (!\count($baseHrefs) || empty($firstBaseHref)) {
          return;
      }

      if (!$this->baseHref) {
          $this->baseHref = $firstBaseHref;

          return;
      }

      // Resolve the <base> href against the existing base href through a throw-away anchor.
      $anchor = $dom->createElement('a');
      $anchor->setAttribute('href', $firstBaseHref);
      $resolved = new Link($anchor, $this->baseHref);
      $this->baseHref = $resolved->getUri();
  }
  /**
   * Adds an XML content to the list of nodes.
   *
   * When the content declares no prefixed namespace ("xmlns:"), every
   * occurrence of "xmlns" is rewritten to "ns" so that the default namespace
   * does not get in the way of XPath expressions.
   *
   * The libxml errors are disabled when the content is parsed.
   *
   * If you want to get parsing errors, be sure to enable
   * internal errors via libxml_use_internal_errors(true)
   * and then, get the errors via libxml_get_errors(). Be
   * sure to clear errors with libxml_clear_errors() afterward.
   *
   * @param string $content The XML markup
   * @param string $charset The encoding handed to the \DOMDocument constructor
   * @param int    $options Bitwise OR of the libxml option constants
   *                        LIBXML_PARSEHUGE is dangerous, see
   *                        http://symfony.com/blog/security-release-symfony-2-0-17-released
   */
  public function addXmlContent(string $content, string $charset = 'UTF-8', int $options = LIBXML_NONET)
  {
      // remove the default namespace if it's the only namespace to make XPath expressions simpler
      if (!preg_match('/xmlns:/', $content)) {
          $content = str_replace('xmlns', 'ns', $content);
      }

      // libxml >= 2.9.0 does not load external entities by default, and
      // libxml_disable_entity_loader() is deprecated as of PHP 8.0, so the
      // entity loader is only toggled on older libxml versions.
      $toggleEntityLoader = \LIBXML_VERSION < 20900;

      $internalErrors = libxml_use_internal_errors(true);
      $disableEntities = false;
      if ($toggleEntityLoader) {
          $disableEntities = libxml_disable_entity_loader(true);
      }

      $dom = new \DOMDocument('1.0', $charset);
      $dom->validateOnParse = true;

      // An empty string cannot be loaded; the (empty) document is still passed to addDocument().
      if ('' !== trim($content)) {
          @$dom->loadXML($content, $options);
      }

      // Restore the previous libxml state.
      libxml_use_internal_errors($internalErrors);
      if ($toggleEntityLoader) {
          libxml_disable_entity_loader($disableEntities);
      }

      $this->addDocument($dom);

      $this->isHtml = false;
  }
  /**
   * Adds the root element of a \DOMDocument to the list of nodes.
   *
   * Documents without a root element are ignored.
   *
   * @param \DOMDocument $dom A \DOMDocument instance
   */
  public function addDocument(\DOMDocument $dom)
  {
      if (!$dom->documentElement) {
          return;
      }

      $this->addNode($dom->documentElement);
  }
  /**
   * Adds every \DOMNode found in a \DOMNodeList to the list of nodes.
   *
   * @param \DOMNodeList $nodes A \DOMNodeList instance
   */
  public function addNodeList(\DOMNodeList $nodes)
  {
      foreach ($nodes as $candidate) {
          if (!$candidate instanceof \DOMNode) {
              continue;
          }

          $this->addNode($candidate);
      }
  }
  /**
   * Adds each entry of an array to the list of nodes.
   *
   * Every entry is handed to add(), so the type dispatching and the
   * exception of add() apply to each of them.
   *
   * @param \DOMNode[] $nodes An array of \DOMNode instances
   */
  public function addNodes(array $nodes)
  {
      foreach ($nodes as $entry) {
          $this->add($entry);
      }
  }
  /**
   * Adds a \DOMNode instance to the list of nodes.
   *
   * A \DOMDocument is replaced by its root element; a document without a
   * root element contributes nothing. The first node added binds the crawler
   * to its owner document, and nodes already present are not added twice.
   *
   * @param \DOMNode $node A \DOMNode instance
   *
   * @throws \InvalidArgumentException When the node belongs to another document than the nodes already added
   */
  public function addNode(\DOMNode $node)
  {
      if ($node instanceof \DOMDocument) {
          $node = $node->documentElement;

          // An empty document has no root element: there is nothing to add,
          // and dereferencing null below would fail.
          if (null === $node) {
              return;
          }
      }

      if (null !== $this->document && $this->document !== $node->ownerDocument) {
          throw new \InvalidArgumentException('Attaching DOM nodes from multiple documents in the same crawler is forbidden.');
      }

      if (null === $this->document) {
          $this->document = $node->ownerDocument;
      }

      // Don't add duplicate nodes in the Crawler
      if (\in_array($node, $this->nodes, true)) {
          return;
      }

      $this->nodes[] = $node;
  }
  /**
   * Wraps the node found at the given position in a new crawler.
   *
   * When there is no node at that position, the new crawler is created from null.
   *
   * @return static
   */
  public function eq(int $position)
  {
      $selected = $this->nodes[$position] ?? null;

      return $this->createSubCrawler($selected);
  }
  /**
   * Runs an anonymous function against each node of the list.
   *
   * The anonymous function is called with the node wrapped in a Crawler
   * instance as first argument and its position as second argument.
   *
   * Example:
   *
   *     $crawler->filter('h1')->each(function ($node, $i) {
   *         return $node->text();
   *     });
   *
   * @param \Closure $closure An anonymous function
   *
   * @return array An array of values returned by the anonymous function
   */
  public function each(\Closure $closure)
  {
      $positions = array_keys($this->nodes);

      // Mapping over two arrays yields a re-indexed list, in node order.
      return array_map(function ($node, $position) use ($closure) {
          return $closure($this->createSubCrawler($node), $position);
      }, $this->nodes, $positions);
  }
  /**
   * Builds a new crawler from a portion of the node list.
   *
   * $offset and $length are passed as is to array_slice().
   *
   * @return static
   */
  public function slice(int $offset = 0, int $length = null)
  {
      $portion = \array_slice($this->nodes, $offset, $length);

      return $this->createSubCrawler($portion);
  }
  /**
   * Builds a new crawler holding only the nodes accepted by an anonymous function.
   *
   * The anonymous function receives the node wrapped in a Crawler instance
   * and its position. A node is dropped only when the function returns
   * exactly false; any other return value keeps it.
   *
   * @param \Closure $closure An anonymous function
   *
   * @return static
   */
  public function reduce(\Closure $closure)
  {
      $kept = [];

      foreach ($this->nodes as $position => $candidate) {
          $verdict = $closure($this->createSubCrawler($candidate), $position);
          if (false === $verdict) {
              continue;
          }

          $kept[] = $candidate;
      }

      return $this->createSubCrawler($kept);
  }
  /**
   * Shortcut for eq(0): the node at the first position of the selection.
   *
   * @return static
   */
  public function first()
  {
      $firstPosition = 0;

      return $this->eq($firstPosition);
  }
  /**
   * Shortcut for eq() on the last position of the selection.
   *
   * @return static
   */
  public function last()
  {
      $lastPosition = \count($this->nodes) - 1;

      return $this->eq($lastPosition);
  }
  /**
   * Returns the siblings nodes of the current selection.
   *
   * The siblings are collected by sibling(), starting from the first child
   * of the parent of the first node.
   *
   * @return static
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function siblings()
  {
      if (!$this->nodes) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $firstChildOfParent = $this->getNode(0)->parentNode->firstChild;
      $siblingNodes = $this->sibling($firstChildOfParent);

      return $this->createSubCrawler($siblingNodes);
  }
  /**
   * Tells whether at least one node of the selection matches a CSS selector.
   *
   * The selector is converted to XPath with the "self::" prefix, so it is
   * tested against the nodes themselves. An empty selection never matches.
   */
  public function matches(string $selector): bool
  {
      if (!$this->nodes) {
          return false;
      }

      $selfXPath = $this->createCssSelectorConverter()->toXPath($selector, 'self::');
      $matched = $this->filterRelativeXPath($selfXPath);

      return $matched->count() !== 0;
  }
  /**
   * Return first parents (heading toward the document root) of the Element that matches the provided selector.
   *
   * The search starts with the first node of the selection itself and walks
   * up through its parents for as long as they are element nodes. Null is
   * returned when nothing matches.
   *
   * @see https://developer.mozilla.org/en-US/docs/Web/API/Element/closest#Polyfill
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function closest(string $selector): ?self
  {
      if (!$this->nodes) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $domNode = $this->getNode(0);

      // parentNode is null for a node that is not attached to any parent,
      // so the null check stops the walk instead of dereferencing null.
      while (null !== $domNode && XML_ELEMENT_NODE === $domNode->nodeType) {
          $node = $this->createSubCrawler($domNode);
          if ($node->matches($selector)) {
              return $node;
          }

          $domNode = $node->getNode(0)->parentNode;
      }

      return null;
  }
  /**
   * Returns the next siblings nodes of the current selection.
   *
   * The nodes are collected by sibling(), starting from the first node of the selection.
   *
   * @return static
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function nextAll()
  {
      if (!$this->nodes) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $following = $this->sibling($this->getNode(0));

      return $this->createSubCrawler($following);
  }
  /**
   * Returns the previous sibling nodes of the current selection.
   *
   * The nodes are collected by sibling() in 'previousSibling' direction,
   * starting from the first node of the selection.
   *
   * @return static
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function previousAll()
  {
      if (!$this->nodes) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $preceding = $this->sibling($this->getNode(0), 'previousSibling');

      return $this->createSubCrawler($preceding);
  }
  /**
   * Returns the parents nodes of the current selection.
   *
   * Walks up from the first node of the selection and keeps every ancestor
   * that is an element node, closest ancestor first.
   *
   * @return static
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function parents()
  {
      if (!$this->nodes) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $ancestors = [];

      for ($current = $this->getNode(0)->parentNode; $current; $current = $current->parentNode) {
          // Non-element ancestors are skipped.
          if (XML_ELEMENT_NODE !== $current->nodeType) {
              continue;
          }

          $ancestors[] = $current;
      }

      return $this->createSubCrawler($ancestors);
  }
  /**
   * Returns the children nodes of the current selection.
   *
   * With a selector, the CSS selector is converted to XPath on the "child::"
   * axis and applied to the selection. Without one, the children are
   * collected by sibling(), starting from the first child of the first node.
   *
   * @return static
   *
   * @throws \InvalidArgumentException When current node is empty
   * @throws \RuntimeException         If the CssSelector Component is not available and $selector is provided
   */
  public function children(string $selector = null)
  {
      if (!$this->nodes) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      if (null === $selector) {
          $firstChild = $this->getNode(0)->firstChild;
          $childNodes = $firstChild ? $this->sibling($firstChild) : [];

          return $this->createSubCrawler($childNodes);
      }

      $childXPath = $this->createCssSelectorConverter()->toXPath($selector, 'child::');

      return $this->filterRelativeXPath($childXPath);
  }
  /**
   * Reads an attribute of the first node of the list.
   *
   * @return string|null The attribute value or null if the attribute does not exist
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function attr(string $attribute)
  {
      if (!$this->nodes) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $firstNode = $this->getNode(0);
      if (!$firstNode->hasAttribute($attribute)) {
          return null;
      }

      return $firstNode->getAttribute($attribute);
  }
  /**
   * Reads the node name of the first node of the list.
   *
   * @return string The node name
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function nodeName()
  {
      if (!$this->nodes) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $firstNode = $this->getNode(0);

      return $firstNode->nodeName;
  }
  /**
   * Returns the text of the first node of the list.
   *
   * Whitespace is normalized by default; pass false as the second argument
   * to get the raw node value instead.
   *
   * @param string|null $default             When not null: the value to return when the current node is empty
   * @param bool        $normalizeWhitespace Whether whitespaces should be trimmed and normalized to single spaces
   *
   * @return string The node value
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function text(string $default = null, bool $normalizeWhitespace = true)
  {
      if (!$this->nodes) {
          if (null === $default) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          return $default;
      }

      $rawText = $this->getNode(0)->nodeValue;
      if (!$normalizeWhitespace) {
          return $rawText;
      }

      // Runs of whitespace and any single non-space whitespace character become one space.
      $collapsed = preg_replace('/(?:\s{2,}+|[^\S ])/', ' ', $rawText);

      return trim($collapsed);
  }
  /**
   * Serializes the children of the first node of the list as HTML.
   *
   * The HTML5 parser does the serialization when it is available and the
   * first child of the owner document serializes to an HTML5 doctype;
   * otherwise the owner document itself does.
   *
   * @param string|null $default When not null: the value to return when the current node is empty
   *
   * @return string The node html
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function html(string $default = null)
  {
      if (!$this->nodes) {
          if (null === $default) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          return $default;
      }

      $firstNode = $this->getNode(0);
      $serializer = $firstNode->ownerDocument;

      $isHtml5Document = null !== $this->html5Parser
          && '<!DOCTYPE html>' === $serializer->saveXML($serializer->childNodes[0]);
      if ($isHtml5Document) {
          $serializer = $this->html5Parser;
      }

      $fragments = [];
      foreach ($firstNode->childNodes as $childNode) {
          $fragments[] = $serializer->saveHTML($childNode);
      }

      return implode('', $fragments);
  }
  /**
   * Serializes the first node of the list, including the node itself, as HTML.
   *
   * The HTML5 parser does the serialization when it is available and the
   * first child of the owner document serializes to an HTML5 doctype;
   * otherwise the owner document itself does.
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function outerHtml(): string
  {
      if (!\count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $firstNode = $this->getNode(0);
      $serializer = $firstNode->ownerDocument;

      $isHtml5Document = null !== $this->html5Parser
          && '<!DOCTYPE html>' === $serializer->saveXML($serializer->childNodes[0]);
      if ($isHtml5Document) {
          $serializer = $this->html5Parser;
      }

      return $serializer->saveHTML($firstNode);
  }
  /**
   * Evaluates an XPath expression against each node of the list.
   *
   * Since an XPath expression might evaluate to either a simple type or a \DOMNodeList,
   * this method will return either an array of simple types or a new Crawler instance.
   * The decision is made on the result obtained for the first node.
   *
   * @return array|Crawler An array of evaluation results or a new Crawler instance
   *
   * @throws \LogicException When the crawler has no document yet
   */
  public function evaluate(string $xpath)
  {
      if (null === $this->document) {
          throw new \LogicException('Cannot evaluate the expression on an uninitialized crawler.');
      }

      $prefixes = $this->findNamespacePrefixes($xpath);
      $engine = $this->createDOMXPath($this->document, $prefixes);

      $results = [];
      foreach ($this->nodes as $contextNode) {
          $results[] = $engine->evaluate($xpath, $contextNode);
      }

      $firstResult = $results[0] ?? null;
      if ($firstResult instanceof \DOMNodeList) {
          return $this->createSubCrawler($results);
      }

      return $results;
  }
  /**
   * Extracts information from the list of nodes.
   *
   * Each requested name is either an attribute name, "_text" for the node
   * value, or "_name" for the node name. When exactly one name is requested,
   * each node contributes a single value; otherwise each node contributes
   * an array of values, in the requested order.
   *
   * Example:
   *
   *     $crawler->filter('h1 a')->extract(['_text', 'href']);
   *
   * @return array An array of extracted values
   */
  public function extract(array $attributes)
  {
      $single = 1 === \count($attributes);

      $rows = [];
      foreach ($this->nodes as $node) {
          $values = [];
          foreach ($attributes as $name) {
              if ('_text' === $name) {
                  $values[] = $node->nodeValue;
                  continue;
              }

              if ('_name' === $name) {
                  $values[] = $node->nodeName;
                  continue;
              }

              $values[] = $node->getAttribute($name);
          }

          $rows[] = $single ? $values[0] : $values;
      }

      return $rows;
  }
  /**
   * Filters the list of nodes with an XPath expression.
   *
   * The XPath expression is evaluated in the context of the crawler, which
   * is considered as a fake parent of the elements inside it.
   * This means that a child selector "div" or "./div" will match only
   * the div elements of the current crawler, not their children.
   *
   * @return static
   */
  public function filterXPath(string $xpath)
  {
      $relativeXPath = $this->relativize($xpath);

      if ('' !== $relativeXPath) {
          return $this->filterRelativeXPath($relativeXPath);
      }

      // If we dropped all expressions in the XPath while preparing it, there would be no match
      return $this->createSubCrawler(null);
  }
  /**
   * Filters the list of nodes with a CSS selector.
   *
   * The selector is converted to XPath and applied to each node of the list.
   * This method only works if you have installed the CssSelector Symfony Component.
   *
   * @return static
   *
   * @throws \RuntimeException if the CssSelector Component is not available
   */
  public function filter(string $selector)
  {
      // The CssSelector already prefixes the selector with descendant-or-self::
      $xpath = $this->createCssSelectorConverter()->toXPath($selector);

      return $this->filterRelativeXPath($xpath);
  }
  /**
   * Selects links by name or alt value for clickable images.
   *
   * An anchor is selected when $value appears as a whole, space-delimited
   * chunk of its normalized text, or of the normalized alt text of an image
   * directly inside it.
   *
   * @return static
   */
  public function selectLink(string $value)
  {
      // Padding with spaces on both sides makes contains() match whole words only.
      $paddedLiteral = static::xpathLiteral(' '.$value.' ');

      $xpath = sprintf('descendant-or-self::a[contains(concat(\' \', normalize-space(string(.)), \' \'), %1$s) or ./img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %1$s)]]', $paddedLiteral);

      return $this->filterRelativeXPath($xpath);
  }
  /**
   * Selects images whose normalized alt text contains the given value.
   *
   * @return static A new instance of Crawler with the filtered list of nodes
   */
  public function selectImage(string $value)
  {
      $altLiteral = static::xpathLiteral($value);

      return $this->filterRelativeXPath(
          sprintf('descendant-or-self::img[contains(normalize-space(string(@alt)), %s)]', $altLiteral)
      );
  }
  /**
   * Selects a button by name or alt value for images.
   *
   * Matches input elements of type submit/button by value, input elements
   * of type image by alt text, and button elements by text; any of them is
   * also matched when its id or name attribute equals $value. The type
   * attribute is compared in lower case.
   *
   * @return static
   */
  public function selectButton(string $value)
  {
      // XPath 1.0 has no lower-case(): translate() folds the type attribute to lower case.
      $lowerCasedType = 'translate(@type, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")';
      // Padded literal for whole-word matching in text, bare literal for exact id/name matching.
      $paddedLiteral = static::xpathLiteral(' '.$value.' ');
      $exactLiteral = static::xpathLiteral($value);

      $xpath = sprintf('descendant-or-self::input[((contains(%1$s, "submit") or contains(%1$s, "button")) and contains(concat(\' \', normalize-space(string(@value)), \' \'), %2$s)) or (contains(%1$s, "image") and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %2$s)) or @id=%3$s or @name=%3$s] | descendant-or-self::button[contains(concat(\' \', normalize-space(string(.)), \' \'), %2$s) or @id=%3$s or @name=%3$s]', $lowerCasedType, $paddedLiteral, $exactLiteral);

      return $this->filterRelativeXPath($xpath);
  }
  /**
   * Builds a Link object from the first node in the list.
   *
   * The link is created with the crawler's base href and the given method.
   *
   * @return Link A Link instance
   *
   * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
   */
  public function link(string $method = 'get')
  {
      if (!$this->nodes) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $firstNode = $this->getNode(0);
      if ($firstNode instanceof \DOMElement) {
          return new Link($firstNode, $this->baseHref, $method);
      }

      throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', \get_class($firstNode)));
  }
  /**
   * Builds one Link object per node in the list.
   *
   * Every link is created with the crawler's base href and the 'get' method.
   *
   * @return Link[] An array of Link instances
   *
   * @throws \InvalidArgumentException If the current node list contains non-DOMElement instances
   */
  public function links()
  {
      $result = [];

      foreach ($this->nodes as $candidate) {
          if ($candidate instanceof \DOMElement) {
              $result[] = new Link($candidate, $this->baseHref, 'get');
              continue;
          }

          throw new \InvalidArgumentException(sprintf('The current node list should contain only DOMElement instances, "%s" found.', \get_class($candidate)));
      }

      return $result;
  }
  /**
   * Builds an Image object from the first node in the list.
   *
   * The image is created with the crawler's base href.
   *
   * @return Image An Image instance
   *
   * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
   */
  public function image()
  {
      if (!\count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $firstNode = $this->getNode(0);
      if ($firstNode instanceof \DOMElement) {
          return new Image($firstNode, $this->baseHref);
      }

      throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', \get_class($firstNode)));
  }
  /**
   * Builds one Image object per node in the list.
   *
   * Every image is created with the crawler's base href.
   *
   * @return Image[] An array of Image instances
   *
   * @throws \InvalidArgumentException If the current node list contains non-DOMElement instances
   */
  public function images()
  {
      $result = [];

      // Iterates over the crawler itself rather than over the node array.
      foreach ($this as $candidate) {
          if ($candidate instanceof \DOMElement) {
              $result[] = new Image($candidate, $this->baseHref);
              continue;
          }

          throw new \InvalidArgumentException(sprintf('The current node list should contain only DOMElement instances, "%s" found.', \get_class($candidate)));
      }

      return $result;
  }
  /**
   * Builds a Form object from the first node in the list.
   *
   * The form is created with the crawler's URI, the given method and the
   * crawler's base href. When $values is not null it is passed to the
   * form's setValues().
   *
   * @return Form A Form instance
   *
   * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
   */
  public function form(array $values = null, string $method = null)
  {
      if (!$this->nodes) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $firstNode = $this->getNode(0);
      if (!$firstNode instanceof \DOMElement) {
          throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', \get_class($firstNode)));
      }

      $builtForm = new Form($firstNode, $this->uri, $method, $this->baseHref);

      if (null === $values) {
          return $builtForm;
      }

      $builtForm->setValues($values);

      return $builtForm;
  }
  /**
   * Replaces the default namespace prefix to be used with XPath and CSS expressions.
   *
   * @param string $prefix The new default prefix
   */
  public function setDefaultNamespacePrefix(string $prefix)
  {
      $newPrefix = $prefix;

      $this->defaultNamespacePrefix = $newPrefix;
  }
  /**
   * Records a namespace in the map of manually registered namespaces.
   *
   * A prefix that was already registered is overwritten.
   *
   * @param string $prefix    The namespace prefix, used as key of the map
   * @param string $namespace The namespace stored for that prefix
   */
  public function registerNamespace(string $prefix, string $namespace)
  {
      $registered = $this->namespaces;
      $registered[$prefix] = $namespace;

      $this->namespaces = $registered;
  }
  /**
   * Turns a string into an XPath string literal.
   *
   * XPath has no escape sequence for quotes: a string without apostrophe is
   * wrapped in apostrophes, a string without double quote is wrapped in
   * double quotes, and a string holding both is rebuilt with concat().
   *
   * Examples:
   *
   *     echo Crawler::xpathLiteral('foo " bar');
   *     //prints 'foo " bar'
   *
   *     echo Crawler::xpathLiteral("foo ' bar");
   *     //prints "foo ' bar"
   *
   *     echo Crawler::xpathLiteral('a\'b"c');
   *     //prints concat('a', "'", 'b"c')
   *
   * @return string Converted string
   */
  public static function xpathLiteral(string $s)
  {
      $hasApostrophe = false !== strpos($s, "'");
      if (!$hasApostrophe) {
          return sprintf("'%s'", $s);
      }

      $hasDoubleQuote = false !== strpos($s, '"');
      if (!$hasDoubleQuote) {
          return sprintf('"%s"', $s);
      }

      // Split on apostrophes: each chunk is apostrophe-free and can be wrapped
      // in apostrophes, and the apostrophes themselves go back in between the
      // chunks as double-quoted literals.
      $chunks = [];
      foreach (explode("'", $s) as $piece) {
          $chunks[] = "'".$piece."'";
      }

      return sprintf('concat(%s)', implode(", \"'\", ", $chunks));
  }
  820. /**
  821. * Filters the list of nodes with an XPath expression.
  822. *
  823. * The XPath expression should already be processed to apply it in the context of each node.
  824. *
  825. * @return static
  826. */
  827. private function filterRelativeXPath(string $xpath): object
  828. {
  829. $prefixes = $this->findNamespacePrefixes($xpath);
  830. $crawler = $this->createSubCrawler(null);
  831. foreach ($this->nodes as $node) {
  832. $domxpath = $this->createDOMXPath($node->ownerDocument, $prefixes);
  833. $crawler->add($domxpath->query($xpath, $node));
  834. }
  835. return $crawler;
  836. }
    /**
     * Make the XPath relative to the current context.
     *
     * The returned XPath will match elements matching the XPath inside the current crawler
     * when running in the context of a node of the crawler.
     *
     * The expression is split on top-level "|" (those outside quoted strings and
     * outside "[...]" predicates); each union member is rewritten on its own and the
     * members are joined back with " | ".
     *
     * @param string $xpath The XPath expression to rewrite
     *
     * @return string The rewritten expression, or $xpath unchanged when it is found to be
     *                invalid (unterminated quoted string or unbalanced square brackets)
     */
    private function relativize(string $xpath): string
    {
        // Rewritten union members, in their original order.
        $expressions = [];

        // An expression which will never match, used to replace union members which cannot match in the crawler.
        // NOTE(review): the original comment was truncated ("We cannot drop"); presumably such a member is
        // replaced rather than dropped so that the rebuilt union (including any preserved opening
        // parentheses) stays well-formed -- confirm.
        $nonMatchingExpression = 'a[name() = "b"]';

        $xpathLen = \strlen($xpath);
        // Nesting depth of "[" ... "]" at the current scan position.
        $openedBrackets = 0;
        // Start offset of the union member being scanned; leading whitespace is skipped.
        $startPosition = strspn($xpath, " \t\n\r\0\x0B");

        // $i runs up to and including $xpathLen so that the end of the string terminates the last member.
        for ($i = $startPosition; $i <= $xpathLen; ++$i) {
            // Jump to the next quote, bracket or "|" (or to the end of the string).
            $i += strcspn($xpath, '"\'[]|', $i);

            if ($i < $xpathLen) {
                switch ($xpath[$i]) {
                    case '"':
                    case "'":
                        // Skip the whole quoted string: move $i onto the matching closing quote.
                        if (false === $i = strpos($xpath, $xpath[$i], $i + 1)) {
                            return $xpath; // The XPath expression is invalid
                        }
                        // "continue 2" targets the for loop (the switch counts as one level).
                        continue 2;
                    case '[':
                        ++$openedBrackets;
                        continue 2;
                    case ']':
                        --$openedBrackets;
                        continue 2;
                }
            }

            // From here on, $i is either on a "|" or at the end of the string.
            if ($openedBrackets) {
                // Inside a predicate (or brackets are unbalanced): this is not a member boundary.
                continue;
            }

            if ($startPosition < $xpathLen && '(' === $xpath[$startPosition]) {
                // If the union is inside some braces, we need to preserve the opening braces and apply
                // the change only inside it.
                $j = 1 + strspn($xpath, "( \t\n\r\0\x0B", $startPosition + 1);
                $parenthesis = substr($xpath, $startPosition, $j);
                $startPosition += $j;
            } else {
                $parenthesis = '';
            }
            // The member's text, without the preserved parentheses and without trailing whitespace.
            $expression = rtrim(substr($xpath, $startPosition, $i - $startPosition));

            // Normalize a leading "self::*/" to "./" so the checks below handle both spellings alike.
            if (0 === strpos($expression, 'self::*/')) {
                $expression = './'.substr($expression, 8);
            }

            // add prefix before absolute element selector
            // The checks are order-sensitive: "//" , ".//" and "./" are tested before the generic
            // "/" and "." first-character test, and "child::" before the final fallback.
            if ('' === $expression) {
                $expression = $nonMatchingExpression;
            } elseif (0 === strpos($expression, '//')) {
                $expression = 'descendant-or-self::'.substr($expression, 2);
            } elseif (0 === strpos($expression, './/')) {
                $expression = 'descendant-or-self::'.substr($expression, 3);
            } elseif (0 === strpos($expression, './')) {
                $expression = 'self::'.substr($expression, 2);
            } elseif (0 === strpos($expression, 'child::')) {
                $expression = 'self::'.substr($expression, 7);
            } elseif ('/' === $expression[0] || '.' === $expression[0] || 0 === strpos($expression, 'self::')) {
                // Any other expression starting with "/", "." or "self::" is replaced by the non-matching one.
                $expression = $nonMatchingExpression;
            } elseif (0 === strpos($expression, 'descendant::')) {
                $expression = 'descendant-or-self::'.substr($expression, 12);
            } elseif (preg_match('/^(ancestor|ancestor-or-self|attribute|following|following-sibling|namespace|parent|preceding|preceding-sibling)::/', $expression)) {
                // the fake root has no parent, preceding or following nodes and also no attributes (even no namespace attributes)
                $expression = $nonMatchingExpression;
            } elseif (0 !== strpos($expression, 'descendant-or-self::')) {
                // Anything else that is not already on the descendant-or-self axis is put on the self axis.
                $expression = 'self::'.$expression;
            }
            $expressions[] = $parenthesis.$expression;

            if ($i === $xpathLen) {
                // The last member has been processed: rebuild the union.
                return implode(' | ', $expressions);
            }

            // $i is on a "|": skip the whitespace following it and start the next member right after.
            $i += strspn($xpath, " \t\n\r\0\x0B", $i + 1);
            $startPosition = $i + 1;
        }

        // Reached when the scan ran past the end with brackets still unbalanced.
        return $xpath; // The XPath expression is invalid
    }
  916. /**
  917. * @return \DOMNode|null
  918. */
  919. public function getNode(int $position)
  920. {
  921. return isset($this->nodes[$position]) ? $this->nodes[$position] : null;
  922. }
  923. /**
  924. * @return int
  925. */
  926. public function count()
  927. {
  928. return \count($this->nodes);
  929. }
  930. /**
  931. * @return \ArrayIterator|\DOMNode[]
  932. */
  933. public function getIterator()
  934. {
  935. return new \ArrayIterator($this->nodes);
  936. }
  937. /**
  938. * @param \DOMElement $node
  939. *
  940. * @return array
  941. */
  942. protected function sibling($node, string $siblingDir = 'nextSibling')
  943. {
  944. $nodes = [];
  945. $currentNode = $this->getNode(0);
  946. do {
  947. if ($node !== $currentNode && XML_ELEMENT_NODE === $node->nodeType) {
  948. $nodes[] = $node;
  949. }
  950. } while ($node = $node->$siblingDir);
  951. return $nodes;
  952. }
  953. private function parseHtml5(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
  954. {
  955. return $this->html5Parser->parse($this->convertToHtmlEntities($htmlContent, $charset), [], $charset);
  956. }
  957. private function parseXhtml(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
  958. {
  959. $htmlContent = $this->convertToHtmlEntities($htmlContent, $charset);
  960. $internalErrors = libxml_use_internal_errors(true);
  961. $disableEntities = libxml_disable_entity_loader(true);
  962. $dom = new \DOMDocument('1.0', $charset);
  963. $dom->validateOnParse = true;
  964. if ('' !== trim($htmlContent)) {
  965. @$dom->loadHTML($htmlContent);
  966. }
  967. libxml_use_internal_errors($internalErrors);
  968. libxml_disable_entity_loader($disableEntities);
  969. return $dom;
  970. }
  971. /**
  972. * Converts charset to HTML-entities to ensure valid parsing.
  973. */
  974. private function convertToHtmlEntities(string $htmlContent, string $charset = 'UTF-8'): string
  975. {
  976. set_error_handler(function () { throw new \Exception(); });
  977. try {
  978. return mb_convert_encoding($htmlContent, 'HTML-ENTITIES', $charset);
  979. } catch (\Exception $e) {
  980. try {
  981. $htmlContent = iconv($charset, 'UTF-8', $htmlContent);
  982. $htmlContent = mb_convert_encoding($htmlContent, 'HTML-ENTITIES', 'UTF-8');
  983. } catch (\Exception $e) {
  984. }
  985. return $htmlContent;
  986. } finally {
  987. restore_error_handler();
  988. }
  989. }
  990. /**
  991. * @throws \InvalidArgumentException
  992. */
  993. private function createDOMXPath(\DOMDocument $document, array $prefixes = []): \DOMXPath
  994. {
  995. $domxpath = new \DOMXPath($document);
  996. foreach ($prefixes as $prefix) {
  997. $namespace = $this->discoverNamespace($domxpath, $prefix);
  998. if (null !== $namespace) {
  999. $domxpath->registerNamespace($prefix, $namespace);
  1000. }
  1001. }
  1002. return $domxpath;
  1003. }
  1004. /**
  1005. * @throws \InvalidArgumentException
  1006. */
  1007. private function discoverNamespace(\DOMXPath $domxpath, string $prefix): ?string
  1008. {
  1009. if (isset($this->namespaces[$prefix])) {
  1010. return $this->namespaces[$prefix];
  1011. }
  1012. // ask for one namespace, otherwise we'd get a collection with an item for each node
  1013. $namespaces = $domxpath->query(sprintf('(//namespace::*[name()="%s"])[last()]', $this->defaultNamespacePrefix === $prefix ? '' : $prefix));
  1014. return ($node = $namespaces->item(0)) ? $node->nodeValue : null;
  1015. }
  1016. private function findNamespacePrefixes(string $xpath): array
  1017. {
  1018. if (preg_match_all('/(?P<prefix>[a-z_][a-z_0-9\-\.]*+):[^"\/:]/i', $xpath, $matches)) {
  1019. return array_unique($matches['prefix']);
  1020. }
  1021. return [];
  1022. }
  1023. /**
  1024. * Creates a crawler for some subnodes.
  1025. *
  1026. * @param \DOMNodeList|\DOMNode|\DOMNode[]|string|null $nodes
  1027. *
  1028. * @return static
  1029. */
  1030. private function createSubCrawler($nodes): object
  1031. {
  1032. $crawler = new static($nodes, $this->uri, $this->baseHref);
  1033. $crawler->isHtml = $this->isHtml;
  1034. $crawler->document = $this->document;
  1035. $crawler->namespaces = $this->namespaces;
  1036. $crawler->html5Parser = $this->html5Parser;
  1037. return $crawler;
  1038. }
  1039. /**
  1040. * @throws \LogicException If the CssSelector Component is not available
  1041. */
  1042. private function createCssSelectorConverter(): CssSelectorConverter
  1043. {
  1044. if (!class_exists(CssSelectorConverter::class)) {
  1045. throw new \LogicException('To filter with a CSS selector, install the CssSelector component ("composer require symfony/css-selector"). Or use filterXpath instead.');
  1046. }
  1047. return new CssSelectorConverter($this->isHtml);
  1048. }
  1049. }