Tuesday, 11 March 2014

Re: [PATCH] increased support for named entities

On Sun, Mar 09, 2014 at 04:11:40PM +0530, rsk1994 wrote:

Thanks for this. I'm going to ignore the generated files and new test
data, as they're uninteresting!

> diff --git a/build/get-entities/getlist.py b/build/get-entities/getlist.py
> new file mode 100644
> index 0000000..a06120b
> --- /dev/null
> +++ b/build/get-entities/getlist.py

What is this file for? It appears to be unused. If it is unnecessary,
please remove it.

> diff --git a/build/make-entities.pl b/build/make-entities.pl
> index 7492052..f9bad7b 100644
> --- a/build/make-entities.pl
> +++ b/build/make-entities.pl
> @@ -4,8 +4,11 @@
> # http://www.opensource.org/licenses/mit-license.php
> # Copyright 2010 Daniel Silverstone <dsilvers@netsurf-browser.org>
> # John-Mark Bell <jmb@netsurf-browser.org>
> +# Rupinder Singh Khokhar <rsk1coder99@gmail.com>
>
> use strict;
> +use utf8;
> +use open ":std", ":encoding(UTF-8)";

I'm not convinced this is necessary, as we shouldn't be emitting
verbatim UTF-8 at any point.

> use constant ENTITIES_FILE => 'build/Entities';
> use constant ENTITIES_INC => 'src/tokeniser/entities.inc';
> @@ -21,8 +24,32 @@ while (my $line = <INFILE>) {
> next if ($line eq '');
> my @elements = split /\s+/, $line;
> my $entity = shift @elements;
> - my $code = shift @elements;
> - $entities{$entity} = $code;
> + my $len = 0;
> + my $code = "";
> + while(@elements) {
> + my $utf8_val = chr(hex(shift(@elements)));

chr() does magical, unexpected, things for values in the range
0x80-0xff. pack("U", hex(shift(@elements)) is almost certainly better.
See below, however.

> + my $backslash_flag = 0;
> + if($utf8_val eq "\n") {
> + $utf8_val = "\\n";
> + $backslash_flag = 1;
> + }
> + if($utf8_val eq "\""
> + || $utf8_val eq "\'"
> + || $utf8_val eq "\\") {
> + $utf8_val = "\\".$utf8_val;
> + $backslash_flag = 1;
> + }
> + $code = $code.$utf8_val;
> + if($backslash_flag) {
> + $len-=1;
> + }
> + }
> +
> + {
> + use Encode;

Unless there's good reason, please declare all module use at the start
of the file.

> + $len += length(Encode::encode_utf8($code));
> + }

There are a couple of issues here:

1) This isn't robust to other control characters
2) In the non \n, ", ', \ case, we're emitting verbatim UTF-8, which
many C compilers can't cope with in string literals.

Thus, I think a different approach is needed. Something like this:

while (@elements) {
my $ucs4 = hex(shift(@entities));
my $utf8 = Encode::encode_utf8(pack('U', $ucs4));

$len += length($utf8);

if ($ucs4 < 0x20 || $ucs4 == 0x22 ||
$ucs4 == 0x27 || $ucs4 == 0x5C ||
$ucs4 >= 0x7f) {
# Unsafe, or non-ASCII: emit escaped
for my $octet (unpack('C*', $utf8)) {
$code = $code . sprintf("\\x%02x", $octet);
}
} else {
# Safe: emit verbatim
$code = $code . $utf8;
}
}

> + $entities{$entity} = "(uint8_t*)\"$code\",".($len);

This wants to be (const uint8_t *)

> }
>
> close(INFILE);
> @@ -113,9 +140,9 @@ foreach my $node (@nodelist) {
> $split = ord($split) if ($split ne '');
> $split = 0 if ($split eq '');
>
> - $value = "0" unless defined($value);
> + $value = "0,0" unless defined($value);

"NULL, 0". Using 0 where the type is a pointer is evil.

> - $output .= "\t{ $split, $lt, $eq, $gt, $value },\n";
> + $output .= "\t{ $split, $lt, $eq, $gt, {$value} },\n";
> }
>
> $output .= "};\n\n";
> diff --git a/src/tokeniser/entities.c b/src/tokeniser/entities.c
> index ac47d80..4ff8625 100644
> --- a/src/tokeniser/entities.c
> +++ b/src/tokeniser/entities.c
> @@ -7,15 +7,20 @@
>
> #include "utils/utils.h"
> #include "tokeniser/entities.h"

Can I have a linebreak here, please?

> +/**
> + * UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER
> + */
> +static const uint8_t u_fffd[3] = { '\xEF', '\xBF', '\xBD' };
> +static const hubbub_string u_fffd_str = { u_fffd, sizeof(u_fffd) };
>
> /** Node in our entity tree */
> typedef struct hubbub_entity_node {
> - /* Do not reorder this without fixing make-entities.pl */
> + /* Do not reorder this without fixing make-entities.pl */
> uint8_t split; /**< Data to split on */
> int32_t lt; /**< Subtree for data less than split */
> int32_t eq; /**< Subtree for data equal to split */
> int32_t gt; /**< Subtree for data greater than split */
> - uint32_t value; /**< Data for this node */
> + hubbub_string value; /**< Data for this node */
> } hubbub_entity_node;
>
> #include "entities.inc"
> @@ -38,7 +43,7 @@ typedef struct hubbub_entity_node {
> * is found.
> */
> static hubbub_error hubbub_entity_tree_search_step(uint8_t c,
> - uint32_t *result, int32_t *context)
> + hubbub_string *result, int32_t *context)
> {
> bool match = false;
> int32_t p;
> @@ -63,7 +68,7 @@ static hubbub_error hubbub_entity_tree_search_step(uint8_t c,
> match = true;
> *result = dict[dict[p].eq].value;
> p = dict[p].eq;
> - } else if (dict[p].value != 0) {
> + } else if (!(dict[p].value.ptr == NULL && dict[p].value.len==0)) {

In general, our style is not to use ! on its own. Instead, express this
as:

if (dict[p].value.ptr != NULL || dict[p].value.len != 0)

However, surely if the pointer is NULL, the length is irrelevant? In
which case, if (dict[p].value.ptr != NULL) will suffice.

> match = true;
> *result = dict[p].value;
> p = dict[p].eq;
> @@ -100,13 +105,13 @@ static hubbub_error hubbub_entity_tree_search_step(uint8_t c,
> * The location pointed to by ::result will be set to U+FFFD unless a match
> * is found.
> */
> -hubbub_error hubbub_entities_search_step(uint8_t c, uint32_t *result,
> +hubbub_error hubbub_entities_search_step(uint8_t c, hubbub_string *result,
> int32_t *context)
> {
> if (result == NULL)
> return HUBBUB_BADPARM;
>
> - *result = 0xFFFD;
> -
> + *result = u_fffd_str;
> +
> return hubbub_entity_tree_search_step(c, result, context);
> }
> diff --git a/src/tokeniser/entities.h b/src/tokeniser/entities.h
> index 0703b37..a8b9bbf 100644
> --- a/src/tokeniser/entities.h
> +++ b/src/tokeniser/entities.h
> @@ -14,7 +14,7 @@
> #include <hubbub/functypes.h>
>
> /* Step-wise search for an entity in the dictionary */
> -hubbub_error hubbub_entities_search_step(uint8_t c, uint32_t *result,
> +hubbub_error hubbub_entities_search_step(uint8_t c, hubbub_string *result,
> int32_t *context);
>
>

No comments:

Post a Comment