-
Notifications
You must be signed in to change notification settings - Fork 0
/
class-wp-xml-tag-processor.php
2801 lines (2549 loc) · 85.1 KB
/
class-wp-xml-tag-processor.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<?php
/**
* XML API: WP_XML_Tag_Processor class
*
* Scans through an XML document to find specific tags, then
* transforms those tags by adding, removing, or updating the
* values of the XML attributes within that tag (opener).
*
* It implements a subset of the XML 1.0 specification (https://www.w3.org/TR/xml/)
* and supports XML documents with the following characteristics:
*
* * XML 1.0
* * Well-formed
* * UTF-8 encoded
* * Not standalone (so can use external entities)
* * No DTD, DOCTYPE, ATTLIST, ENTITY, or conditional sections
*
* ### Possible future direction for this module
*
* The final goal is to support both 1.0 and 1.1 depending on the
* initial processing instruction (<?xml version="1.0" ?>). We're
* starting with 1.0, however, because that's what most WXR
* files declare.
*
* ## Future work
*
* @TODO: Skip over the following syntax elements:
* * <!DOCTYPE, see https://www.w3.org/TR/xml/#sec-prolog-dtd
* * <!ATTLIST, see https://www.w3.org/TR/xml/#attdecls
* * <!ENTITY, see https://www.w3.org/TR/xml/#sec-entity-decl
* * <!NOTATION, see https://www.w3.org/TR/xml/#Notations
* * Conditional sections, see https://www.w3.org/TR/xml/#sec-condition-sect
*
* @TODO Explore declaring elements as PCdata directly in the XML document,
* for example as follows:
*
* <!ELEMENT p (#PCDATA|emph)* >
*
* or
*
* <!DOCTYPE test [
* <!ELEMENT test (#PCDATA) >
* <!ENTITY % xx '%zz;'>
* <!ENTITY % zz '<!ENTITY tricky "error-prone" >' >
* %xx;
* ]>
*
* @TODO: Support XML 1.1.
* @package WordPress
* @subpackage HTML-API
* @since WP_VERSION
*/
/**
* Core class used to modify attributes in an XML document for tags matching a query.
*
* ## Usage
*
* Use of this class requires three steps:
*
* 1. Create a new class instance with your input XML document.
* 2. Find the tag(s) you are looking for.
* 3. Request changes to the attributes in those tag(s).
*
* Example:
*
* $tags = new WP_XML_Tag_Processor( $xml );
* if ( $tags->next_tag( 'wp:option' ) ) {
* $tags->set_attribute( 'selected', 'yes' );
* }
*
* ### Finding tags
*
* The `next_tag()` function moves the internal cursor through
* your input XML document until it finds a tag meeting any of
* the supplied restrictions in the optional query argument. If
* no argument is provided then it will find the next XML tag,
* regardless of what kind it is.
*
* If you want to _find whatever the next tag is_:
*
* $tags->next_tag();
*
* | Goal | Query |
* |-----------------------------------------------------------|---------------------------------------------------------------------------------|
* | Find any tag. | `$tags->next_tag();` |
* | Find next image tag. | `$tags->next_tag( array( 'tag_name' => 'wp:image' ) );` |
* | Find next image tag (without passing the array). | `$tags->next_tag( 'wp:image' );` |
*
* If a tag was found meeting your criteria then `next_tag()`
* will return `true` and you can proceed to modify it. If it
* returns `false`, however, it failed to find the tag and
* moved the cursor to the end of the file.
*
* Once the cursor reaches the end of the file the processor
* is done and if you want to reach an earlier tag you will
* need to recreate the processor and start over, as it's
* unable to back up or move in reverse.
*
* See the section on bookmarks for an exception to this
* no-backing-up rule.
*
* #### Custom queries
*
* Sometimes it's necessary to further inspect an XML tag than
* the query syntax here permits. In these cases one may further
* inspect the search results using the read-only functions
* provided by the processor or external state or variables.
*
* Example:
*
* // Paint up to the first five `wp:musician` or `wp:actor` tags marked with the "jazzy" style.
* $remaining_count = 5;
* while ( $remaining_count > 0 && $tags->next_tag() ) {
* if (
* ( 'wp:musician' === $tags->get_tag() || 'wp:actor' === $tags->get_tag() ) &&
* 'jazzy' === $tags->get_attribute( 'data-style' )
* ) {
* $tags->set_attribute( 'wp:theme-style', 'theme-style-everest-jazz' );
* $remaining_count--;
* }
* }
*
* `get_attribute()` will return `null` if the attribute wasn't present
* on the tag when it was called. It may return `""` (the empty string)
* in cases where the attribute was present but its value was empty.
* For boolean attributes, those whose name is present but no value is
* given, it will return `true` (the only way to set `false` for an
* attribute is to remove it).
*
* #### When matching fails
*
* When `next_tag()` returns `false` it could mean different things:
*
* - The requested tag wasn't found in the input document.
* - The input document ended in the middle of an XML syntax element.
*
* When a document ends in the middle of a syntax element it will pause
* the processor. This is to make it possible in the future to extend the
* input document and proceed - an important requirement for chunked
* streaming parsing of a document.
*
* Example:
*
* $processor = new WP_XML_Tag_Processor( 'This <wp:content is="a" partial="token' );
* false === $processor->next_tag();
*
* If a special element (see next section) is encountered but no closing tag
* is found it will count as an incomplete tag. The parser will pause as if
* the opening tag were incomplete.
*
* Example:
*
* $processor = new WP_XML_Tag_Processor( '<style>// there could be more styling to come' );
* false === $processor->next_tag();
*
* $processor = new WP_XML_Tag_Processor( '<style>// this is everything</style><wp:content>' );
* true === $processor->next_tag( 'wp:content' );
*
* #### Special elements
*
* All XML elements are handled in the same way, except when you mark
* them as PCdata elements. These are special because their contents
* are treated as text, even if they look like XML tags.
*
* Example:
*
* $processor = new WP_XML_Tag_Processor( '<root><wp:post-content>Text inside</input></wp:post-content></root>' );
* $processor->declare_element_as_pcdata('wp:post-content');
* $processor->next_tag('wp:post-content');
* $processor->next_token();
* echo $processor->get_modifiable_text(); // Text inside</input>
*
* ### Modifying XML attributes for a found tag
*
* Once you've found the start of an opening tag you can modify
* any number of the attributes on that tag. You can set a new
* value for an attribute, remove the entire attribute, or do
* nothing and move on to the next opening tag.
*
* Example:
*
* if ( $tags->next_tag( 'wp:user-group' ) ) {
* $tags->set_attribute( 'name', 'Content editors' );
* $tags->remove_attribute( 'data-test-id' );
* }
*
* If `set_attribute()` is called for an existing attribute it will
* overwrite the existing value. Similarly, calling `remove_attribute()`
* for a non-existing attribute has no effect on the document. Both
* of these methods are safe to call without knowing if a given attribute
* exists beforehand.
*
* ### Bookmarks
*
* While scanning through the input XML document it's possible to set
* a named bookmark when a particular tag is found. Later on, after
* continuing to scan other tags, it's possible to `seek` to one of
* the set bookmarks and then proceed again from that point forward.
*
* Because bookmarks create processing overhead one should avoid
* creating too many of them. As a rule, create only bookmarks
* of known string literal names; avoid creating "mark_{$index}"
* and so on. It's fine from a performance standpoint to create a
* bookmark and update it frequently, such as within a loop.
*
* $total_todos = 0;
* while ( $p->next_tag( array( 'tag_name' => 'wp:todo-list' ) ) ) {
* $p->set_bookmark( 'list-start' );
* while ( $p->next_tag( array( 'tag_closers' => 'visit' ) ) ) {
* if ( 'wp:todo-list' === $p->get_tag() && $p->is_tag_closer() ) {
* $p->set_bookmark( 'list-end' );
* $p->seek( 'list-start' );
* $p->set_attribute( 'data-contained-todos', (string) $total_todos );
* $total_todos = 0;
* $p->seek( 'list-end' );
* break;
* }
*
* if ( 'wp:todo-item' === $p->get_tag() && ! $p->is_tag_closer() ) {
* $total_todos++;
* }
* }
* }
*
* ## Tokens and finer-grained processing.
*
* It's possible to scan through every lexical token in the
* XML document using the `next_token()` function. This
* alternative form takes no argument and provides no built-in
* query syntax.
*
* Example:
*
* $title = '(untitled)';
* $text = '';
* while ( $processor->next_token() ) {
* switch ( $processor->get_token_name() ) {
* case '#text':
* $text .= $processor->get_modifiable_text();
* break;
*
* case 'wp:new-line':
* $text .= "\n";
* break;
*
* case 'wp:title':
* $title = $processor->get_modifiable_text();
* break;
* }
* }
* return trim( "# {$title}\n\n{$text}" );
*
* ### Tokens and _modifiable text_.
*
* #### Other tokens with modifiable text.
*
* There are also non-elements which are void/self-closing in nature and contain
* modifiable text that is part of that individual syntax token itself.
*
* - `#text` nodes, whose entire token _is_ the modifiable text.
* - XML comments and tokens that become comments due to some syntax error. The
* text for these tokens is the portion of the comment inside of the syntax.
* E.g. for `<!-- comment -->` the text is `" comment "` (note the spaces are included).
* - `CDATA` sections, whose text is the content inside of the section itself. E.g. for
* `<![CDATA[some content]]>` the text is `"some content"`.
* - XML Processing instruction nodes like `<?xml __( "Like" ); ?>` (with restrictions [1]).
*
* [1]: XML requires "xml" as a processing instruction name. The Tag Processor captures the entire
* processing instruction as a single token up to the closing `?>`.
*
* ## Design and limitations
*
* The Tag Processor is designed to linearly scan XML documents and tokenize
* XML tags and their attributes. It's designed to do this as efficiently as
* possible without compromising parsing integrity. Therefore it will be
* slower than some methods of modifying XML, such as those incorporating
* over-simplified PCRE patterns, but will not introduce the defects and
* failures that those methods bring in, which lead to broken page renders
* and often to security vulnerabilities. On the other hand, it will be faster
* than full-blown XML parsers such as DOMDocument and use considerably
* less memory. It requires a negligible memory overhead, enough to consider
* it a zero-overhead system.
*
* The performance characteristics are maintained by avoiding tree construction.
*
* The Tag Processor checks the most important aspects of XML integrity as it scans
* through the document. It verifies that a single root element exists, that there are
* no unclosed tags, and that each opener tag has a corresponding closer. It also
* ensures no duplicate attributes exist on a single tag.
*
* At the same time, the Tag Processor also skips expensive validation of XML entities
* in the document. The Tag Processor will initially pass through the invalid entity references
* and only fail when the developer attempts to read their value. If that doesn't happen,
* the invalid values will be left untouched in the final document.
*
* Most operations within the Tag Processor are designed to minimize the difference
* between an input and output document for any given change. For example, the
* `set_attribute` and `remove_attribute` methods preserve whitespace and the attribute
* ordering within the element definition. An exception to this rule is that all attribute
* updates store their values as double-quoted strings, meaning that attributes on input with
* single-quoted or unquoted values will appear in the output with double-quotes.
*
* ### Text Encoding
*
* The Tag Processor assumes that the input XML document is encoded with a
* UTF-8 encoding and will refuse to process documents that declare other encodings.
*
* @since WP_VERSION
*/
class WP_XML_Tag_Processor {
/**
* The maximum number of bookmarks allowed to exist at
* any given time.
*
* @since WP_VERSION
* @var int
*
* @see WP_XML_Tag_Processor::set_bookmark()
*/
const MAX_BOOKMARKS = 10;
/**
* Maximum number of times seek() can be called.
* Prevents accidental infinite loops.
*
* @since WP_VERSION
* @var int
*
* @see WP_XML_Tag_Processor::seek()
*/
const MAX_SEEK_OPS = 1000;
/**
* The XML document to parse.
*
* @since WP_VERSION
* @var string
*/
protected $xml;
/**
* The last query passed to next_tag().
*
* @since WP_VERSION
* @var array|null
*/
private $last_query;
/**
* The tag name this processor currently scans for.
*
* @since WP_VERSION
* @var string|null
*/
private $sought_tag_name;
/**
* The match offset this processor currently scans for.
*
* @since WP_VERSION
* @var int|null
*/
private $sought_match_offset;
/**
* Whether to visit tag closers, e.g. </wp:content>, when walking an input document.
*
* @since WP_VERSION
* @var bool
*/
private $stop_on_tag_closers;
/**
* Specifies mode of operation of the parser at any given time.
*
* | State | Meaning |
* | ----------------|------------------------------------------------------------------------|
* | *Ready* | The parser is ready to run. |
* | *Complete* | There is nothing left to parse. |
* | *Incomplete* | The XML ended in the middle of a token; nothing more can be parsed. |
* | *Matched tag* | Found an XML tag; it's possible to modify its attributes. |
* | *Text node* | Found a #text node; this is plaintext and modifiable. |
* | *CDATA node* | Found a CDATA section; this is modifiable. |
* | *PI node* | Found a processing instruction; this is modifiable. |
* | *XML declaration* | Found an XML declaration; this is modifiable. |
* | *Comment* | Found a comment or bogus comment; this is modifiable. |
*
* @since WP_VERSION
*
* @see WP_XML_Tag_Processor::STATE_READY
* @see WP_XML_Tag_Processor::STATE_COMPLETE
* @see WP_XML_Tag_Processor::STATE_INCOMPLETE_INPUT
* @see WP_XML_Tag_Processor::STATE_MATCHED_TAG
* @see WP_XML_Tag_Processor::STATE_TEXT_NODE
* @see WP_XML_Tag_Processor::STATE_CDATA_NODE
* @see WP_XML_Tag_Processor::STATE_PI_NODE
* @see WP_XML_Tag_Processor::STATE_XML_DECLARATION
* @see WP_XML_Tag_Processor::STATE_COMMENT
*
* @var string
*/
protected $parser_state = self::STATE_READY;
/**
* Whether we stopped at an incomplete text node.
*
* If we are before the last tag in the document, every text
* node is incomplete until we find the next tag. However,
* if we are after the last tag, an incomplete all-whitespace
* node may either mean we're at the end of the document or
* that we're still waiting for more data.
*
* This flag allows us to differentiate between these two
* cases in context-aware APIs such as WP_XML_Processor.
*
* @var bool
*/
protected $is_incomplete_text_node = false;
/**
* How many bytes from the original XML document have been read and parsed.
*
* This value points to the latest byte offset in the input document which
* has been already parsed. It is the internal cursor for the Tag Processor
* and updates while scanning through the XML tokens.
*
* @since WP_VERSION
* @var int
*/
private $bytes_already_parsed = 0;
/**
* Byte offset in input document where current token starts.
*
* Example:
*
* <wp:content id="test">...
* 01234
* - token starts at 0
*
* @since WP_VERSION
*
* @var int|null
*/
private $token_starts_at;
/**
* Byte length of current token.
*
* Example:
*
* <wp:content id="test">...
* 0123456789012345678901
* - token length is 22 - 0 = 22
*
* a <!-- comment --> is a token.
* 0123456789 123456789 123456789
* - token length is 17 - 2 = 15
*
* @since WP_VERSION
*
* @var int|null
*/
private $token_length;
/**
* Byte offset in input document where current tag name starts.
*
* Example:
*
* <wp:content id="test">...
* 01234
* - tag name starts at 1
*
* @since WP_VERSION
*
* @var int|null
*/
private $tag_name_starts_at;
/**
* Byte length of current tag name.
*
* Example:
*
* <wp:content id="test">...
* 01234567890
* ---------- tag name length is 10
*
* @since WP_VERSION
*
* @var int|null
*/
private $tag_name_length;
/**
* Byte offset into input document where current modifiable text starts.
*
* @since WP_VERSION
*
* @var int
*/
private $text_starts_at;
/**
* Byte length of modifiable text.
*
* @since WP_VERSION
*
* @var int
*/
private $text_length;
/**
* Whether the current tag is an opening tag, e.g. <wp:content>, or a closing tag, e.g. </wp:content>.
*
* @var bool
*/
private $is_closing_tag;
/**
* Stores an explanation for why something failed, if it did.
*
* @see self::get_last_error
*
* @since WP_VERSION
*
* @var string|null
*/
protected $last_error = null;
/**
* Lazily-built index of attributes found within an XML tag, keyed by the attribute name.
*
* Example:
*
* // Supposing the parser is working through this content
* // and stops after recognizing the `id` attribute.
* // <wp:content id="test-4" class=outline title="data:text/plain;base64=asdk3nk1j3fo8">
* // ^ parsing will continue from this point.
* $this->attributes = array(
* 'id' => new WP_HTML_Attribute_Token( 'id', 9, 6, 5, 11, false )
* );
*
* // When picking up parsing again, or when asking to find the
* // `class` attribute we will continue and add to this array.
* $this->attributes = array(
* 'id' => new WP_HTML_Attribute_Token( 'id', 9, 6, 5, 11, false ),
* 'class' => new WP_HTML_Attribute_Token( 'class', 23, 7, 17, 13, false )
* );
*
* @since WP_VERSION
* @var WP_HTML_Attribute_Token[]
*/
private $attributes = array();
/**
* Tracks a semantic location in the original XML which
* shifts with updates as they are applied to the document.
*
* @since WP_VERSION
* @var WP_HTML_Span[]
*/
protected $bookmarks = array();
/**
* Lexical replacements to apply to input XML document.
*
* "Lexical" in this class refers to the part of this class which
* operates on pure text _as text_ and not as XML. There's a line
* between the public interface, with XML-semantic methods like
* `set_attribute` and `add_class`, and an internal state that tracks
* text offsets in the input document.
*
* When higher-level XML methods are called, those have to transform their
* operations (such as setting an attribute's value) into text diffing
* operations (such as replacing the sub-string from indices A to B with
* some given new string). These text-diffing operations are the lexical
* updates.
*
* As new higher-level methods are added they need to collapse their
* operations into these lower-level lexical updates since that's the
* Tag Processor's internal language of change. Any code which creates
* these lexical updates must ensure that they do not cross XML syntax
* boundaries, however, so these should never be exposed outside of this
* class or any classes which intentionally expand its functionality.
*
* These are enqueued while editing the document instead of being immediately
* applied to avoid processing overhead, string allocations, and string
* copies when applying many updates to a single document.
*
* Example:
*
* // Replace an attribute stored with a new value, indices
* // sourced from the lazily-parsed XML recognizer.
* $start = $attributes['src']->start;
* $length = $attributes['src']->length;
* $modifications[] = new WP_HTML_Text_Replacement( $start, $length, $new_value );
*
* // Correspondingly, something like this will appear in this array.
* $lexical_updates = array(
* WP_HTML_Text_Replacement( 14, 28, 'https://my-site.my-domain/wp-content/uploads/2014/08/kittens.jpg' )
* );
*
* @since WP_VERSION
* @var WP_HTML_Text_Replacement[]
*/
protected $lexical_updates = array();
/**
* Tracks and limits `seek()` calls to prevent accidental infinite loops.
*
* @since WP_VERSION
* @var int
*
* @see WP_XML_Tag_Processor::seek()
*/
protected $seek_count = 0;
/**
* Flag that defaults to `false`.
*
* NOTE(review): nothing in the visible part of this class reads or writes
* this property. Judging by its name, it presumably records that earlier
* chunks of a streamed document were already processed before the current
* `$xml` buffer - confirm against the code that sets it.
*
* @var bool
*/
public $had_previous_chunks = false;
/**
* Constructor.
*
* @since WP_VERSION
*
* @param string $xml XML to process.
*/
public function __construct( $xml ) {
// The constructor only stores the document; nothing is scanned here.
$this->xml = $xml;
}
/**
* Finds the next element matching the $query.
*
* This doesn't currently have a way to represent non-tags and doesn't process
* semantic rules for text nodes.
*
* @since WP_VERSION
*
* @param array|string|null $query {
* Optional. Which element name to find. Default is to find any tag.
*
* @type string|null $tag_name Which tag to find, or `null` for "any tag."
* @type int|null $match_offset Find the Nth tag matching all search criteria.
* 1 for "first" tag, 3 for "third," etc.
* Defaults to first tag.
* @type string|null $tag_closers "visit" or "skip": whether to stop on tag closers, e.g. </wp:content>.
* }
* @return bool Whether a tag was matched.
*/
public function next_tag( $query = null ) {
	/*
	 * Record what the caller is looking for. The loop below reads
	 * `$this->sought_match_offset`, which parse_query() is expected
	 * to populate.
	 */
	$this->parse_query( $query );

	$matches_seen = 0;

	do {
		// No further token could be produced, so the search ends unsuccessfully.
		if ( false === $this->base_class_next_token() ) {
			return false;
		}

		// Only tag tokens are candidates; every other token is passed over.
		$is_matching_tag = self::STATE_MATCHED_TAG === $this->parser_state && $this->matches();
		if ( $is_matching_tag ) {
			++$matches_seen;
		}
	} while ( $matches_seen < $this->sought_match_offset );

	return true;
}
/**
* Finds the next token in the XML document.
*
* An XML document can be viewed as a stream of tokens,
* where tokens are things like XML tags, XML comments,
* text nodes, etc. This method finds the next token in
* the XML document and returns whether it found one.
*
* If it starts parsing a token and reaches the end of the
* document then it will seek to the start of the last
* token and pause, returning `false` to indicate that it
* failed to find a complete token.
*
* Possible token types, based on the XML specification:
*
* - an XML tag, whether opening, closing, or void.
* - a text node - the plaintext inside tags.
* - an XML comment.
* - a processing instruction, e.g. `<?xml version="1.0" ?>`.
*
* The Tag Processor currently only supports the tag token.
*
* @since WP_VERSION
*
* @access private
*
* @return bool Whether a token was parsed.
*/
public function next_token() {
/*
* Thin public entry point. The actual scanning lives in the protected
* base_class_next_token(), which next_tag() also calls directly.
*/
return $this->base_class_next_token();
}
/**
* Internal method which finds the next token in the XML document.
*
* This method is a protected internal function which implements the logic for
* finding the next token in a document. It exists so that the parser can update
* its state without affecting the location of the cursor in the document and
* without triggering subclass methods for things like `next_token()`, e.g. when
* applying patches before searching for the next token.
*
* @since 6.5.0
*
* @access private
*
* @return bool Whether a token was parsed.
*/
protected function base_class_next_token() {
/*
* Overview: advance the cursor by one token. Anything parse_next_tag() did
* not mark as a tag returns immediately. For tags this method additionally
* scans the attributes (or, for a closer, trailing whitespace), locates the
* terminating `>`, and - for elements reported by is_pcdata_element() -
* extends the token across the element's text up to where skip_pcdata()
* leaves the cursor.
*/
// Cursor position on entry; restored below whenever the token proves incomplete.
$was_at = $this->bytes_already_parsed;
// NOTE(review): after_tag() is defined elsewhere; presumably it finalizes the previous token before scanning resumes - confirm.
$this->after_tag();
// Don't proceed if there's nothing more to scan.
if (
self::STATE_COMPLETE === $this->parser_state ||
self::STATE_INCOMPLETE_INPUT === $this->parser_state ||
null !== $this->last_error
) {
return false;
}
/*
* The next step in the parsing loop determines the parsing state;
* clear it so that state doesn't linger from the previous step.
*/
$this->parser_state = self::STATE_READY;
// The cursor is at or past the end of the input: the document is finished.
if ( $this->bytes_already_parsed >= strlen( $this->xml ) ) {
$this->parser_state = self::STATE_COMPLETE;
return false;
}
// Find the next tag if it exists.
if ( false === $this->parse_next_tag() ) {
// Rewind only when the input was cut short; other failures leave the cursor where it is.
if ( self::STATE_INCOMPLETE_INPUT === $this->parser_state ) {
$this->bytes_already_parsed = $was_at;
}
return false;
}
// A recorded error stops processing even though parse_next_tag() did not return false.
if ( null !== $this->last_error ) {
return false;
}
/*
* For legacy reasons the rest of this function handles tags and their
* attributes. If the processor has reached the end of the document
* or if it matched any other token then it should return here to avoid
* attempting to process tag-specific syntax.
*/
if (
self::STATE_INCOMPLETE_INPUT !== $this->parser_state &&
self::STATE_COMPLETE !== $this->parser_state &&
self::STATE_MATCHED_TAG !== $this->parser_state
) {
return true;
}
// Closers get only whitespace skipping here; openers get their attributes parsed.
if ( $this->is_closing_tag ) {
$this->skip_whitespace();
} else {
// Parse all of its attributes.
while ( $this->parse_next_attribute() ) {
continue;
}
}
// Bail out if scanning the inside of the tag recorded an error.
if ( null !== $this->last_error ) {
return false;
}
// Ensure that the tag closes before the end of the document.
if (
self::STATE_INCOMPLETE_INPUT === $this->parser_state ||
$this->bytes_already_parsed >= strlen( $this->xml )
) {
// Does this appropriately clear state (parsed attributes)?
$this->parser_state = self::STATE_INCOMPLETE_INPUT;
$this->bytes_already_parsed = $was_at;
return false;
}
// Locate the `>` that terminates the tag, searching from the current cursor.
$tag_ends_at = strpos( $this->xml, '>', $this->bytes_already_parsed );
if ( false === $tag_ends_at ) {
$this->parser_state = self::STATE_INCOMPLETE_INPUT;
$this->bytes_already_parsed = $was_at;
return false;
}
/*
* For a closer the cursor must already sit exactly on the `>` once
* whitespace has been skipped; any other byte in between is a syntax error.
*/
if ( $this->is_closing_tag && $tag_ends_at !== $this->bytes_already_parsed ) {
$this->last_error = self::ERROR_SYNTAX;
_doing_it_wrong(
__METHOD__,
__( 'Invalid closing tag encountered.' ),
'WP_VERSION'
);
return false;
}
// Advance past the `>`; the token spans from its start to the new cursor.
$this->parser_state = self::STATE_MATCHED_TAG;
$this->bytes_already_parsed = $tag_ends_at + 1;
$this->token_length = $this->bytes_already_parsed - $this->token_starts_at;
/*
* If we are in a PCData element, everything until the closer
* is considered text.
*/
if ( ! $this->is_pcdata_element() ) {
return true;
}
/*
* Preserve the opening tag pointers, as these will be overwritten
* when finding the closing tag. They will be reset after finding
* the closing tag to point to the opening of the special atomic
* tag sequence.
*/
$tag_name_starts_at = $this->tag_name_starts_at;
$tag_name_length = $this->tag_name_length;
// Byte offset just past the opener's `>`; the element's text begins here.
$tag_ends_at = $this->token_starts_at + $this->token_length;
$attributes = $this->attributes;
$found_closer = $this->skip_pcdata( $this->get_tag() );
// Closer not found, the document is incomplete.
if ( false === $found_closer ) {
$this->parser_state = self::STATE_INCOMPLETE_INPUT;
$this->bytes_already_parsed = $was_at;
return false;
}
/*
* The values here look like they reference the opening tag but they reference
* the closing tag instead. This is why the opening tag values were stored
* above in a variable. It reads confusingly here, but that's because the
* functions that skip the contents have moved all the internal cursors past
* the inner content of the tag.
*
* NOTE(review): text_length is derived from the tag_name_starts_at value left
* behind by skip_pcdata(), which is defined elsewhere. Confirm which byte of
* the closer it points at, since that decides whether `</` counts as text.
*/
$this->token_starts_at = $was_at;
$this->token_length = $this->bytes_already_parsed - $this->token_starts_at;
$this->text_starts_at = $tag_ends_at;
$this->text_length = $this->tag_name_starts_at - $this->text_starts_at;
// Restore the opener's name and attributes so the merged token still describes the opening tag.
$this->tag_name_starts_at = $tag_name_starts_at;
$this->tag_name_length = $tag_name_length;
$this->attributes = $attributes;
return true;
}
/**
* Whether the processor paused because the input XML document ended
* in the middle of a syntax element, such as in the middle of a tag.
*
* Example:
*
* $processor = new WP_XML_Tag_Processor( '<input type="text" value="Th' );
* false === $processor->next_tag();
* true === $processor->paused_at_incomplete_token();
*
* @since WP_VERSION
*
* @return bool Whether the parse paused at the start of an incomplete token.
*/
public function paused_at_incomplete_token() {
// STATE_INCOMPLETE_INPUT is what base_class_next_token() sets when the input ends before the current token does.
return self::STATE_INCOMPLETE_INPUT === $this->parser_state;
}
/**
* Sets a bookmark in the XML document.
*
* Bookmarks represent specific places or tokens in the XML
* document, such as a tag opener or closer. When applying
* edits to a document, such as setting an attribute, the
* text offsets of that token may shift; the bookmark is
* kept updated with those shifts and remains stable unless
* the entire span of text in which the token sits is removed.
*
* Release bookmarks when they are no longer needed.
*
* Example:
*
* <main><h2>Surprising fact you may not know!</h2></main>
* ^ ^
* \-|-- this `H2` opener bookmark tracks the token
*
* <main class="clickbait"><h2>Surprising fact you may no…
* ^ ^
* \-|-- it shifts with edits
*
* Bookmarks provide the ability to seek to a previously-scanned
* place in the XML document. This avoids the need to re-scan
* the entire document.
*
* Example:
*
* <ul><li>One</li><li>Two</li><li>Three</li></ul>
* ^^^^
* want to note this last item
*
* $p = new WP_XML_Tag_Processor( $xml );
* $in_list = false;
* while ( $p->next_tag( array( 'tag_closers' => $in_list ? 'visit' : 'skip' ) ) ) {
* if ( 'UL' === $p->get_tag() ) {
* if ( $p->is_tag_closer() ) {
* $in_list = false;
* $p->set_bookmark( 'resume' );
* if ( $p->seek( 'last-li' ) ) {
* $p->add_class( 'last-li' );
* }
* $p->seek( 'resume' );
* $p->release_bookmark( 'last-li' );
* $p->release_bookmark( 'resume' );
* } else {
* $in_list = true;
* }
* }
*
* if ( 'LI' === $p->get_tag() ) {
* $p->set_bookmark( 'last-li' );
* }
* }
*
* Bookmarks intentionally hide the internal string offsets
* to which they refer. They are maintained internally as
* updates are applied to the XML document and therefore
* retain their "position" - the location to which they
* originally pointed. The inability to use bookmarks with
* functions like `substr` is therefore intentional to guard
* against accidentally breaking the XML.
*
* Because bookmarks allocate memory and require processing
* for every applied update, they are limited and require
* a name. They should not be created with programmatically-made
* names, such as "li_{$index}" with some loop. As a general
* rule they should only be created with string-literal names
* like "start-of-section" or "last-paragraph".
*
* Bookmarks are a powerful tool to enable complicated behavior.
* Consider double-checking that you need this tool if you are
* reaching for it, as inappropriate use could lead to broken
* XML structure or unwanted processing overhead.
*
* @since WP_VERSION
*
* @param string $name Identifies this particular bookmark.
* @return bool Whether the bookmark was successfully created.
*/
public function set_bookmark( $name ) {
	/*
	 * A bookmark records the span of the current token, so refuse when the
	 * parser has finished or has stalled on truncated input.
	 */
	$states_without_token = array( self::STATE_COMPLETE, self::STATE_INCOMPLETE_INPUT );
	if ( in_array( $this->parser_state, $states_without_token, true ) ) {
		return false;
	}

	// Re-setting an existing name overwrites it, so only new names count against the limit.
	$is_new_name = ! array_key_exists( $name, $this->bookmarks );
	$is_at_limit = count( $this->bookmarks ) >= static::MAX_BOOKMARKS;
	if ( $is_new_name && $is_at_limit ) {
		_doing_it_wrong(
			__METHOD__,
			__( 'Too many bookmarks: cannot create any more.' ),
			'WP_VERSION'
		);
		return false;
	}

	// Store the byte span of the current token under the given name.
	$token_span               = new WP_HTML_Span( $this->token_starts_at, $this->token_length );
	$this->bookmarks[ $name ] = $token_span;

	return true;
}
/**
* Removes a bookmark that is no longer needed.
*
* Releasing a bookmark frees up the small
* performance overhead it requires.
*
* @param string $name Name of the bookmark to remove.
* @return bool Whether the bookmark already existed before removal.
*/
public function release_bookmark( $name ) {
if ( ! array_key_exists( $name, $this->bookmarks ) ) {
return false;
}