@@ -809,149 +809,144 @@ static inline size_t write_octet_sequence(unsigned char *buf, enum entity_charse
809
809
/* Size of the output buffer traverse_for_entities() expects for an input of
 * `oldlen` bytes: the input length plus one fifth of it (integer division),
 * plus 2 -- one byte for the remainder lost by the division (probably
 * unnecessary) and one byte for the terminating 0.
 * Note: `oldlen` is evaluated twice, so pass an expression without side
 * effects. */
#define TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(oldlen) ((oldlen) + (oldlen) / 5 + 2)
811
811
static void traverse_for_entities (
812
- const char * input ,
813
- size_t input_len ,
812
+ const zend_string * input ,
814
813
zend_string * output , /* should have allocated TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(olden) */
815
- int all ,
816
- int flags ,
814
+ const int all ,
815
+ const int flags ,
817
816
const entity_ht * inv_map ,
818
- enum entity_charset charset )
817
+ const enum entity_charset charset )
819
818
{
820
- const char * current_ptr = input ;
821
- const char * input_end = input + input_len ; /* terminator address */
822
- char * output_ptr = ZSTR_VAL (output );
823
- int doctype = flags & ENT_HTML_DOC_TYPE_MASK ;
824
-
825
- assert (* input_end == '\0' );
826
-
827
- while (current_ptr < input_end ) {
828
- const char * ampersand_ptr = memchr (current_ptr , '&' , input_end - current_ptr );
829
- if (!ampersand_ptr ) {
830
- size_t tail_len = input_end - current_ptr ;
831
- if (tail_len > 0 ) {
832
- memcpy (output_ptr , current_ptr , tail_len );
833
- output_ptr += tail_len ;
834
- }
835
- break ;
836
- }
837
-
838
- /* Copy everything up to the found '&' */
839
- size_t chunk_len = ampersand_ptr - current_ptr ;
840
- if (chunk_len > 0 ) {
841
- memcpy (output_ptr , current_ptr , chunk_len );
842
- output_ptr += chunk_len ;
843
- }
844
-
845
- /* Now current_ptr points to the '&' character. */
846
- current_ptr = ampersand_ptr ;
847
-
848
- /* If there are less than 4 bytes remaining, there isn't enough for an entity – copy '&' as a normal character */
849
- if (current_ptr + 3 >= input_end ) {
850
- * output_ptr ++ = * current_ptr ++ ;
851
- continue ;
852
- }
853
-
854
- unsigned code = 0 , code2 = 0 ;
855
- const char * entity_end_ptr = NULL ;
856
- int valid_entity = 1 ;
857
-
858
- if (current_ptr [1 ] == '#' ) {
859
- /* Processing numeric entity */
860
- const char * num_start = current_ptr + 2 ;
861
- entity_end_ptr = num_start ;
862
- if (process_numeric_entity (& entity_end_ptr , & code ) == FAILURE ) {
863
- valid_entity = 0 ;
864
- }
865
- /* If we're in htmlspecialchars_decode, we're only decoding entities
866
- * that represent &, <, >, " and '. Is this one of them? */
867
- if (valid_entity && !all &&
868
- (code > 63U ||
869
- stage3_table_be_apos_00000 [code ].data .ent .entity == NULL ))
870
- {
871
- valid_entity = 0 ;
872
- }
873
- /* are we allowed to decode this entity in this document type?
874
- * HTML 5 is the only that has a character that cannot be used in
875
- * a numeric entity but is allowed literally (U+000D). The
876
- * unoptimized version would be ... || !numeric_entity_is_allowed(code) */
877
- if (valid_entity && (!unicode_cp_is_allowed (code , doctype ) ||
878
- (doctype == ENT_HTML_DOC_HTML5 && code == 0x0D )))
879
- {
880
- valid_entity = 0 ;
881
- }
882
- } else {
883
- /* Processing named entity */
884
- const char * name_start = current_ptr + 1 ;
885
- /* Search for ';' */
886
- const char * semi_colon_ptr = memchr (name_start , ';' , LONGEST_ENTITY_LENGTH + 1 );
887
- if (!semi_colon_ptr ) {
888
- valid_entity = 0 ;
889
- } else {
890
- size_t name_len = semi_colon_ptr - name_start ;
891
- if (name_len == 0 ) {
892
- valid_entity = 0 ;
893
- } else {
894
- if (resolve_named_entity_html (name_start , name_len , inv_map , & code , & code2 ) == FAILURE ) {
895
- if (doctype == ENT_HTML_DOC_XHTML && name_len == 4 &&
896
- name_start [0 ] == 'a' && name_start [1 ] == 'p' &&
897
- name_start [2 ] == 'o' && name_start [3 ] == 's' )
898
- {
899
- /* uses html4 inv_map, which doesn't include apos;. This is a
819
+ const char * current_ptr = ZSTR_VAL (input );
820
+ const char * input_end = current_ptr + input -> len ; /* terminator address */
821
+ char * output_ptr = ZSTR_VAL (output );
822
+ const int doctype = flags & ENT_HTML_DOC_TYPE_MASK ;
823
+
824
+ while (current_ptr < input_end ) {
825
+ const char * ampersand_ptr = memchr (current_ptr , '&' , input_end - current_ptr );
826
+ if (!ampersand_ptr ) {
827
+ const size_t tail_len = input_end - current_ptr ;
828
+ if (tail_len > 0 ) {
829
+ memcpy (output_ptr , current_ptr , tail_len );
830
+ output_ptr += tail_len ;
831
+ }
832
+ break ;
833
+ }
834
+
835
+ /* Copy everything up to the found '&' */
836
+ const size_t chunk_len = ampersand_ptr - current_ptr ;
837
+ if (chunk_len > 0 ) {
838
+ memcpy (output_ptr , current_ptr , chunk_len );
839
+ output_ptr += chunk_len ;
840
+ }
841
+
842
+ /* Now current_ptr points to the '&' character. */
843
+ current_ptr = ampersand_ptr ;
844
+
845
+ /* If there are less than 4 bytes remaining, there isn't enough for an entity – copy '&' as a normal character */
846
+ if (input_end - current_ptr < 4 ){
847
+ const size_t remaining = input_end - current_ptr ;
848
+ memcpy (output_ptr , current_ptr , remaining );
849
+ output_ptr += remaining ;
850
+ break ;
851
+ }
852
+
853
+ unsigned code = 0 , code2 = 0 ;
854
+ const char * entity_end_ptr = NULL ;
855
+ bool valid_entity = true;
856
+
857
+ if (current_ptr [1 ] == '#' ) {
858
+ /* Processing numeric entity */
859
+ const char * num_start = current_ptr + 2 ;
860
+ entity_end_ptr = num_start ;
861
+ if (process_numeric_entity (& entity_end_ptr , & code ) == FAILURE ) {
862
+ valid_entity = false;
863
+ }
864
+ if (valid_entity && !all && (code > 63U || stage3_table_be_apos_00000 [code ].data .ent .entity == NULL )) {
865
+ /* If we're in htmlspecialchars_decode, we're only decoding entities
866
+ * that represent &, <, >, " and '. Is this one of them? */
867
+ valid_entity = false;
868
+ } else if (valid_entity && (!unicode_cp_is_allowed (code , doctype ) ||
869
+ (doctype == ENT_HTML_DOC_HTML5 && code == 0x0D ))) {
870
+ /* are we allowed to decode this entity in this document type?
871
+ * HTML 5 is the only that has a character that cannot be used in
872
+ * a numeric entity but is allowed literally (U+000D). The
873
+ * unoptimized version would be ... || !numeric_entity_is_allowed(code) */
874
+ valid_entity = false;
875
+ }
876
+ } else {
877
+ /* Processing named entity */
878
+ const char * name_start = current_ptr + 1 ;
879
+ /* Search for ';' */
880
+ const size_t max_search_len = MIN (LONGEST_ENTITY_LENGTH + 1 , input_end - name_start );
881
+ const char * semi_colon_ptr = memchr (name_start , ';' , max_search_len );
882
+ if (!semi_colon_ptr ) {
883
+ valid_entity = false;
884
+ } else {
885
+ const size_t name_len = semi_colon_ptr - name_start ;
886
+ if (name_len == 0 ) {
887
+ valid_entity = false;
888
+ } else {
889
+ if (resolve_named_entity_html (name_start , name_len , inv_map , & code , & code2 ) == FAILURE ) {
890
+ if (doctype == ENT_HTML_DOC_XHTML && name_len == 4 &&
891
+ name_start [0 ] == 'a' && name_start [1 ] == 'p' &&
892
+ name_start [2 ] == 'o' && name_start [3 ] == 's' )
893
+ {
894
+ /* uses html4 inv_map, which doesn't include apos;. This is a
900
895
* hack to support it */
901
- code = (unsigned )'\'' ;
902
- } else {
903
- valid_entity = 0 ;
904
- }
905
- }
906
- entity_end_ptr = semi_colon_ptr ;
907
- }
908
- }
909
- }
910
-
911
- /* If entity_end_ptr is not found or does not point to ';', consider the entity invalid */
912
- if (!valid_entity || entity_end_ptr == NULL || * entity_end_ptr != ';' ) {
913
- * output_ptr ++ = * current_ptr ++ ;
914
- continue ;
915
- }
916
-
917
- /* Check if quotes are allowed for entities representing ' or " */
918
- if ( ((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE )) ||
919
- (code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE ) )))
920
- {
921
- valid_entity = 0 ;
922
- }
923
-
924
- /* UTF-8 doesn't need mapping (ISO-8859-1 doesn't either, but
896
+ code = (unsigned )'\'' ;
897
+ } else {
898
+ valid_entity = false ;
899
+ }
900
+ }
901
+ entity_end_ptr = semi_colon_ptr ;
902
+ }
903
+ }
904
+ }
905
+
906
+ /* If entity_end_ptr is not found or does not point to ';', consider the entity invalid */
907
+ if (!valid_entity || entity_end_ptr == NULL || * entity_end_ptr != ';' ) {
908
+ * output_ptr ++ = * current_ptr ++ ;
909
+ continue ;
910
+ }
911
+
912
+ /* Check if quotes are allowed for entities representing ' or " */
913
+ if ((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE )) ||
914
+ (code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE )))
915
+ {
916
+ valid_entity = false ;
917
+ }
918
+
919
+ /* UTF-8 doesn't need mapping (ISO-8859-1 doesn't either, but
925
920
* the call is needed to ensure the codepoint <= U+00FF) */
926
- if (valid_entity && charset != cs_utf_8 ) {
927
- /* replace unicode code point */
928
- if (map_from_unicode (code , charset , & code ) == FAILURE || code2 != 0 )
929
- valid_entity = 0 ;
930
- }
931
-
932
- if (valid_entity ) {
933
- /* Write the parsed entity into the output buffer */
934
- output_ptr += write_octet_sequence ((unsigned char * )output_ptr , charset , code );
935
- if (code2 ) {
936
- output_ptr += write_octet_sequence ((unsigned char * )output_ptr , charset , code2 );
937
- }
938
- /* Move current_ptr past the semicolon */
939
- current_ptr = entity_end_ptr + 1 ;
940
- } else {
941
- /* If the entity is invalid, copy characters from current_ptr up to entity_end_ptr */
942
- if (entity_end_ptr ) {
943
- size_t len = entity_end_ptr - current_ptr ;
944
- memcpy (output_ptr , current_ptr , len );
945
- output_ptr += len ;
946
- current_ptr = entity_end_ptr ;
947
- } else {
948
- * output_ptr ++ = * current_ptr ++ ;
949
- }
950
- }
951
- }
952
-
953
- * output_ptr = '\0' ;
954
- ZSTR_LEN (output ) = (size_t )(output_ptr - ZSTR_VAL (output ));
921
+ if (valid_entity && charset != cs_utf_8 ) {
922
+ /* replace unicode code point */
923
+ if (map_from_unicode (code , charset , & code ) == FAILURE || code2 != 0 )
924
+ valid_entity = false ;
925
+ }
926
+
927
+ if (valid_entity ) {
928
+ /* Write the parsed entity into the output buffer */
929
+ output_ptr += write_octet_sequence ((unsigned char * )output_ptr , charset , code );
930
+ if (code2 ) {
931
+ output_ptr += write_octet_sequence ((unsigned char * )output_ptr , charset , code2 );
932
+ }
933
+ /* Move current_ptr past the semicolon */
934
+ current_ptr = entity_end_ptr + 1 ;
935
+ } else {
936
+ /* If the entity is invalid, copy characters from current_ptr up to entity_end_ptr */
937
+ if (entity_end_ptr ) {
938
+ const size_t len = entity_end_ptr - current_ptr ;
939
+ memcpy (output_ptr , current_ptr , len );
940
+ output_ptr += len ;
941
+ current_ptr = entity_end_ptr ;
942
+ } else {
943
+ * output_ptr ++ = * current_ptr ++ ;
944
+ }
945
+ }
946
+ }
947
+
948
+ * output_ptr = '\0' ;
949
+ ZSTR_LEN (output ) = (size_t )(output_ptr - ZSTR_VAL (output ));
955
950
}
956
951
/* }}} */
957
952
@@ -1036,7 +1031,7 @@ PHPAPI zend_string *php_unescape_html_entities(zend_string *str, int all, int fl
1036
1031
inverse_map = unescape_inverse_map (all , flags );
1037
1032
1038
1033
/* replace numeric entities */
1039
- traverse_for_entities (ZSTR_VAL ( str ), ZSTR_LEN ( str ) , ret , all , flags , inverse_map , charset );
1034
+ traverse_for_entities (str , ret , all , flags , inverse_map , charset );
1040
1035
1041
1036
return ret ;
1042
1037
}
0 commit comments