Skip to content

Commit f093c30

Browse files
committed
CR, refactoring, codestyle
fix logic
1 parent 66f5709 commit f093c30

File tree

1 file changed

+134
-139
lines changed

1 file changed

+134
-139
lines changed

ext/standard/html.c

+134-139
Original file line number | Diff line number | Diff line change
@@ -809,149 +809,144 @@ static inline size_t write_octet_sequence(unsigned char *buf, enum entity_charse
809809
/* +2 is 1 because of rest (probably unnecessary), 1 because of terminating 0 */
810810
#define TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(oldlen) ((oldlen) + (oldlen) / 5 + 2)
811811
static void traverse_for_entities(
812-
const char *input,
813-
size_t input_len,
812+
const zend_string *input,
814813
zend_string *output, /* should have allocated TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(oldlen) */
815-
int all,
816-
int flags,
814+
const int all,
815+
const int flags,
817816
const entity_ht *inv_map,
818-
enum entity_charset charset)
817+
const enum entity_charset charset)
819818
{
820-
const char *current_ptr = input;
821-
const char *input_end = input + input_len; /* terminator address */
822-
char *output_ptr = ZSTR_VAL(output);
823-
int doctype = flags & ENT_HTML_DOC_TYPE_MASK;
824-
825-
assert(*input_end == '\0');
826-
827-
while (current_ptr < input_end) {
828-
const char *ampersand_ptr = memchr(current_ptr, '&', input_end - current_ptr);
829-
if (!ampersand_ptr) {
830-
size_t tail_len = input_end - current_ptr;
831-
if (tail_len > 0) {
832-
memcpy(output_ptr, current_ptr, tail_len);
833-
output_ptr += tail_len;
834-
}
835-
break;
836-
}
837-
838-
/* Copy everything up to the found '&' */
839-
size_t chunk_len = ampersand_ptr - current_ptr;
840-
if (chunk_len > 0) {
841-
memcpy(output_ptr, current_ptr, chunk_len);
842-
output_ptr += chunk_len;
843-
}
844-
845-
/* Now current_ptr points to the '&' character. */
846-
current_ptr = ampersand_ptr;
847-
848-
/* If there are less than 4 bytes remaining, there isn't enough for an entity – copy '&' as a normal character */
849-
if (current_ptr + 3 >= input_end) {
850-
*output_ptr++ = *current_ptr++;
851-
continue;
852-
}
853-
854-
unsigned code = 0, code2 = 0;
855-
const char *entity_end_ptr = NULL;
856-
int valid_entity = 1;
857-
858-
if (current_ptr[1] == '#') {
859-
/* Processing numeric entity */
860-
const char *num_start = current_ptr + 2;
861-
entity_end_ptr = num_start;
862-
if (process_numeric_entity(&entity_end_ptr, &code) == FAILURE) {
863-
valid_entity = 0;
864-
}
865-
/* If we're in htmlspecialchars_decode, we're only decoding entities
866-
* that represent &, <, >, " and '. Is this one of them? */
867-
if (valid_entity && !all &&
868-
(code > 63U ||
869-
stage3_table_be_apos_00000[code].data.ent.entity == NULL))
870-
{
871-
valid_entity = 0;
872-
}
873-
/* are we allowed to decode this entity in this document type?
874-
* HTML 5 is the only that has a character that cannot be used in
875-
* a numeric entity but is allowed literally (U+000D). The
876-
* unoptimized version would be ... || !numeric_entity_is_allowed(code) */
877-
if (valid_entity && (!unicode_cp_is_allowed(code, doctype) ||
878-
(doctype == ENT_HTML_DOC_HTML5 && code == 0x0D)))
879-
{
880-
valid_entity = 0;
881-
}
882-
} else {
883-
/* Processing named entity */
884-
const char *name_start = current_ptr + 1;
885-
/* Search for ';' */
886-
const char *semi_colon_ptr = memchr(name_start, ';', LONGEST_ENTITY_LENGTH + 1);
887-
if (!semi_colon_ptr) {
888-
valid_entity = 0;
889-
} else {
890-
size_t name_len = semi_colon_ptr - name_start;
891-
if (name_len == 0) {
892-
valid_entity = 0;
893-
} else {
894-
if (resolve_named_entity_html(name_start, name_len, inv_map, &code, &code2) == FAILURE) {
895-
if (doctype == ENT_HTML_DOC_XHTML && name_len == 4 &&
896-
name_start[0] == 'a' && name_start[1] == 'p' &&
897-
name_start[2] == 'o' && name_start[3] == 's')
898-
{
899-
/* uses html4 inv_map, which doesn't include apos;. This is a
819+
const char *current_ptr = ZSTR_VAL(input);
820+
const char *input_end = current_ptr + input->len; /* terminator address */
821+
char *output_ptr = ZSTR_VAL(output);
822+
const int doctype = flags & ENT_HTML_DOC_TYPE_MASK;
823+
824+
while (current_ptr < input_end) {
825+
const char *ampersand_ptr = memchr(current_ptr, '&', input_end - current_ptr);
826+
if (!ampersand_ptr) {
827+
const size_t tail_len = input_end - current_ptr;
828+
if (tail_len > 0) {
829+
memcpy(output_ptr, current_ptr, tail_len);
830+
output_ptr += tail_len;
831+
}
832+
break;
833+
}
834+
835+
/* Copy everything up to the found '&' */
836+
const size_t chunk_len = ampersand_ptr - current_ptr;
837+
if (chunk_len > 0) {
838+
memcpy(output_ptr, current_ptr, chunk_len);
839+
output_ptr += chunk_len;
840+
}
841+
842+
/* Now current_ptr points to the '&' character. */
843+
current_ptr = ampersand_ptr;
844+
845+
/* If there are fewer than 4 bytes remaining, there isn't enough for an entity – copy the remaining bytes verbatim and stop */
846+
if (input_end - current_ptr < 4){
847+
const size_t remaining = input_end - current_ptr;
848+
memcpy(output_ptr, current_ptr, remaining);
849+
output_ptr += remaining;
850+
break;
851+
}
852+
853+
unsigned code = 0, code2 = 0;
854+
const char *entity_end_ptr = NULL;
855+
bool valid_entity = true;
856+
857+
if (current_ptr[1] == '#') {
858+
/* Processing numeric entity */
859+
const char *num_start = current_ptr + 2;
860+
entity_end_ptr = num_start;
861+
if (process_numeric_entity(&entity_end_ptr, &code) == FAILURE) {
862+
valid_entity = false;
863+
}
864+
if (valid_entity && !all && (code > 63U || stage3_table_be_apos_00000[code].data.ent.entity == NULL)) {
865+
/* If we're in htmlspecialchars_decode, we're only decoding entities
866+
* that represent &, <, >, " and '. Is this one of them? */
867+
valid_entity = false;
868+
} else if (valid_entity && (!unicode_cp_is_allowed(code, doctype) ||
869+
(doctype == ENT_HTML_DOC_HTML5 && code == 0x0D))) {
870+
/* are we allowed to decode this entity in this document type?
871+
* HTML 5 is the only one that has a character that cannot be used in
872+
* a numeric entity but is allowed literally (U+000D). The
873+
* unoptimized version would be ... || !numeric_entity_is_allowed(code) */
874+
valid_entity = false;
875+
}
876+
} else {
877+
/* Processing named entity */
878+
const char *name_start = current_ptr + 1;
879+
/* Search for ';' */
880+
const size_t max_search_len = MIN(LONGEST_ENTITY_LENGTH + 1, input_end - name_start);
881+
const char *semi_colon_ptr = memchr(name_start, ';', max_search_len);
882+
if (!semi_colon_ptr) {
883+
valid_entity = false;
884+
} else {
885+
const size_t name_len = semi_colon_ptr - name_start;
886+
if (name_len == 0) {
887+
valid_entity = false;
888+
} else {
889+
if (resolve_named_entity_html(name_start, name_len, inv_map, &code, &code2) == FAILURE) {
890+
if (doctype == ENT_HTML_DOC_XHTML && name_len == 4 &&
891+
name_start[0] == 'a' && name_start[1] == 'p' &&
892+
name_start[2] == 'o' && name_start[3] == 's')
893+
{
894+
/* uses html4 inv_map, which doesn't include apos;. This is a
900895
* hack to support it */
901-
code = (unsigned)'\'';
902-
} else {
903-
valid_entity = 0;
904-
}
905-
}
906-
entity_end_ptr = semi_colon_ptr;
907-
}
908-
}
909-
}
910-
911-
/* If entity_end_ptr is not found or does not point to ';', consider the entity invalid */
912-
if (!valid_entity || entity_end_ptr == NULL || *entity_end_ptr != ';') {
913-
*output_ptr++ = *current_ptr++;
914-
continue;
915-
}
916-
917-
/* Check if quotes are allowed for entities representing ' or " */
918-
if (((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE)) ||
919-
(code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE))))
920-
{
921-
valid_entity = 0;
922-
}
923-
924-
/* UTF-8 doesn't need mapping (ISO-8859-1 doesn't either, but
896+
code = (unsigned)'\'';
897+
} else {
898+
valid_entity = false;
899+
}
900+
}
901+
entity_end_ptr = semi_colon_ptr;
902+
}
903+
}
904+
}
905+
906+
/* If entity_end_ptr is not found or does not point to ';', consider the entity invalid */
907+
if (!valid_entity || entity_end_ptr == NULL || *entity_end_ptr != ';') {
908+
*output_ptr++ = *current_ptr++;
909+
continue;
910+
}
911+
912+
/* Check if quotes are allowed for entities representing ' or " */
913+
if ((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE)) ||
914+
(code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE)))
915+
{
916+
valid_entity = false;
917+
}
918+
919+
/* UTF-8 doesn't need mapping (ISO-8859-1 doesn't either, but
925920
* the call is needed to ensure the codepoint <= U+00FF) */
926-
if (valid_entity && charset != cs_utf_8) {
927-
/* replace unicode code point */
928-
if (map_from_unicode(code, charset, &code) == FAILURE || code2 != 0)
929-
valid_entity = 0;
930-
}
931-
932-
if (valid_entity) {
933-
/* Write the parsed entity into the output buffer */
934-
output_ptr += write_octet_sequence((unsigned char*)output_ptr, charset, code);
935-
if (code2) {
936-
output_ptr += write_octet_sequence((unsigned char*)output_ptr, charset, code2);
937-
}
938-
/* Move current_ptr past the semicolon */
939-
current_ptr = entity_end_ptr + 1;
940-
} else {
941-
/* If the entity is invalid, copy characters from current_ptr up to entity_end_ptr */
942-
if (entity_end_ptr) {
943-
size_t len = entity_end_ptr - current_ptr;
944-
memcpy(output_ptr, current_ptr, len);
945-
output_ptr += len;
946-
current_ptr = entity_end_ptr;
947-
} else {
948-
*output_ptr++ = *current_ptr++;
949-
}
950-
}
951-
}
952-
953-
*output_ptr = '\0';
954-
ZSTR_LEN(output) = (size_t)(output_ptr - ZSTR_VAL(output));
921+
if (valid_entity && charset != cs_utf_8) {
922+
/* replace unicode code point */
923+
if (map_from_unicode(code, charset, &code) == FAILURE || code2 != 0)
924+
valid_entity = false;
925+
}
926+
927+
if (valid_entity) {
928+
/* Write the parsed entity into the output buffer */
929+
output_ptr += write_octet_sequence((unsigned char*)output_ptr, charset, code);
930+
if (code2) {
931+
output_ptr += write_octet_sequence((unsigned char*)output_ptr, charset, code2);
932+
}
933+
/* Move current_ptr past the semicolon */
934+
current_ptr = entity_end_ptr + 1;
935+
} else {
936+
/* If the entity is invalid, copy characters from current_ptr up to entity_end_ptr */
937+
if (entity_end_ptr) {
938+
const size_t len = entity_end_ptr - current_ptr;
939+
memcpy(output_ptr, current_ptr, len);
940+
output_ptr += len;
941+
current_ptr = entity_end_ptr;
942+
} else {
943+
*output_ptr++ = *current_ptr++;
944+
}
945+
}
946+
}
947+
948+
*output_ptr = '\0';
949+
ZSTR_LEN(output) = (size_t)(output_ptr - ZSTR_VAL(output));
955950
}
956951
/* }}} */
957952

@@ -1036,7 +1031,7 @@ PHPAPI zend_string *php_unescape_html_entities(zend_string *str, int all, int fl
10361031
inverse_map = unescape_inverse_map(all, flags);
10371032

10381033
/* replace numeric entities */
1039-
traverse_for_entities(ZSTR_VAL(str), ZSTR_LEN(str), ret, all, flags, inverse_map, charset);
1034+
traverse_for_entities(str, ret, all, flags, inverse_map, charset);
10401035

10411036
return ret;
10421037
}

0 commit comments

Comments
 (0)