Monday, 31 March 2014

[PATCH 06/10] tester fixes & minor bug fix in the tokeniser

---
src/tokeniser/tokeniser.c | 2 +-
test/data/tokeniser2/INDEX | 5 +++-
test/data/tokeniser2/escapeFlag.test | 24 ++++++++--------
test/data/tokeniser2/unicodeChars.test | 8 ------
test/tokeniser2.c | 2 +-
test/tokeniser3.c | 50 ++++++++++++++++++++--------------
6 files changed, 47 insertions(+), 44 deletions(-)

diff --git a/src/tokeniser/tokeniser.c b/src/tokeniser/tokeniser.c
index 8390bf0..41a4b0e 100644
--- a/src/tokeniser/tokeniser.c
+++ b/src/tokeniser/tokeniser.c
@@ -753,7 +753,7 @@ hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
}

/* Emit a replacement character */
- emit_current_chars(tokeniser);
+ emit_character_token(tokeniser, u_fffd_str);

/* Advance past NUL */
parserutils_inputstream_advance(tokeniser->input, 1);
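
For context, this hunk is the actual bug fix in the series: in the data state the HTML5 spec treats U+0000 as a parse error and emits a U+FFFD REPLACEMENT CHARACTER token, but emit_current_chars() would have emitted the pending input with the raw NUL byte included. The fixed call emits the replacement string instead. A minimal sketch of the constant the call relies on (the exact definition in tokeniser.c is assumed here, not quoted):

#include <stdint.h>
#include <hubbub/types.h>

/* UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER (bytes EF BF BD);
 * assumed shape of the u_fffd_str constant used by the fix. */
static const uint8_t u_fffd[3] = { 0xEF, 0xBF, 0xBD };
static const hubbub_string u_fffd_str = { u_fffd, sizeof(u_fffd) };
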
diff --git a/test/data/tokeniser2/INDEX b/test/data/tokeniser2/INDEX
index 9da56e7..9ff8596 100644
--- a/test/data/tokeniser2/INDEX
+++ b/test/data/tokeniser2/INDEX
@@ -10,6 +10,9 @@ entities.test html5lib entity tests
escapeFlag.test html5lib escape flag tests
numericEntities.test html5lib numeric entities tests
unicodeChars.test html5lib unicode character tests
+#unicodeCharsProblematic.test html5lib problematic unicode character tests
cdata.test CDATA section tests
-regression.test Regression tests
+#regression.test Regression tests
namedEntities.test html5lib named entities tests
+pendingSpecChanges.test html5lib spec changes tests
+#xmlViolation.test xmlViolation
diff --git a/test/data/tokeniser2/escapeFlag.test b/test/data/tokeniser2/escapeFlag.test
index 4c4bf51..18cb430 100644
--- a/test/data/tokeniser2/escapeFlag.test
+++ b/test/data/tokeniser2/escapeFlag.test
@@ -1,33 +1,33 @@
{"tests": [

-{"description":"Commented close tag in [R]CDATA",
-"contentModelFlags":["RCDATA", "CDATA"],
+{"description":"Commented close tag in RCDATA or RAWTEXT",
+"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"foo<!--</xmp>--></xmp>",
-"output":[["Character", "foo<!--</xmp>-->"], ["EndTag", "xmp"]]},
+"output":[["Character", "foo<!--"], ["EndTag", "xmp"], ["Character", "-->"], ["EndTag", "xmp"]]},

-{"description":"Bogus comment in [R]CDATA",
-"contentModelFlags":["RCDATA", "CDATA"],
+{"description":"Bogus comment in RCDATA or RAWTEXT",
+"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"foo<!-->baz</xmp>",
"output":[["Character", "foo<!-->baz"], ["EndTag", "xmp"]]},

-{"description":"End tag surrounded by bogus comment in [R]CDATA",
-"contentModelFlags":["RCDATA", "CDATA"],
+{"description":"End tag surrounded by bogus comment in RCDATA or RAWTEXT",
+"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"foo<!--></xmp><!-->baz</xmp>",
"output":[["Character", "foo<!-->"], ["EndTag", "xmp"], "ParseError", ["Comment", ""], ["Character", "baz"], ["EndTag", "xmp"]]},

{"description":"Commented entities in RCDATA",
-"contentModelFlags":["RCDATA"],
+"initialStates":["RCDATA state"],
"lastStartTag":"xmp",
"input":" &amp; <!-- &amp; --> &amp; </xmp>",
-"output":[["Character", " & <!-- &amp; --> & "], ["EndTag", "xmp"]]},
+"output":[["Character", " & <!-- & --> & "], ["EndTag", "xmp"]]},

-{"description":"Incorrect comment ending sequences in [R]CDATA",
-"contentModelFlags":["RCDATA", "CDATA"],
+{"description":"Incorrect comment ending sequences in RCDATA or RAWTEXT",
+"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"foo<!-- x --x>x-- >x--!>x--<></xmp>",
-"output":[["Character", "foo<!-- x --x>x-- >x--!>x--<></xmp>"]]}
+"output":[["Character", "foo<!-- x --x>x-- >x--!>x--<>"], ["EndTag", "xmp"]]}

]}
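
The rewritten expectations track the current HTML5 tokeniser, which drops the old "escape flag": <!-- no longer suppresses end-tag recognition, and the CDATA content model has been renamed to the RAWTEXT state (hence contentModelFlags becoming initialStates). So "foo<!--</xmp>-->" now tokenises as the characters "foo<!--", an </xmp> end tag, then "-->". A toy model of the one rule these new outputs encode (illustrative names, not hubbub's code):

#include <stdbool.h>
#include <strings.h>

/* In RCDATA/RAWTEXT, "</name" ends text mode iff "name" matches the
 * last start tag (ASCII case-insensitively); "<!--" is ordinary text. */
static bool is_appropriate_end_tag(const char *name,
		const char *last_start_tag)
{
	return strcasecmp(name, last_start_tag) == 0;
}
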
diff --git a/test/data/tokeniser2/unicodeChars.test b/test/data/tokeniser2/unicodeChars.test
index 9b59015..c778668 100644
--- a/test/data/tokeniser2/unicodeChars.test
+++ b/test/data/tokeniser2/unicodeChars.test
@@ -112,14 +112,6 @@
"input": "\u007F",
"output": ["ParseError", ["Character", "\u007F"]]},

-{"description": "Invalid Unicode character U+D800",
-"input": "\uD800",
-"output": ["ParseError", ["Character", "\uD800"]]},
-
-{"description": "Invalid Unicode character U+DFFF",
-"input": "\uDFFF",
-"output": ["ParseError", ["Character", "\uDFFF"]]},
-
{"description": "Invalid Unicode character U+FDD0",
"input": "\uFDD0",
"output": ["ParseError", ["Character", "\uFDD0"]]},
diff --git a/test/tokeniser2.c b/test/tokeniser2.c
index bf0e69f..3024e81 100644
--- a/test/tokeniser2.c
+++ b/test/tokeniser2.c
@@ -86,7 +86,7 @@ int main(int argc, char **argv)
} else if (strcmp(key, "lastStartTag") == 0) {
ctx.last_start_tag = (const char *)
json_object_get_string(val);
- } else if (strcmp(key, "contentModelFlags") == 0) {
+ } else if (strcmp(key, "initialStates") == 0) {
ctx.content_model =
json_object_get_array(val);
} else if (strcmp(key, "processCDATA") == 0) {
diff --git a/test/tokeniser3.c b/test/tokeniser3.c
index 949ddd0..c4c5231 100644
--- a/test/tokeniser3.c
+++ b/test/tokeniser3.c
@@ -29,6 +29,7 @@ typedef struct context {

static void run_test(context *ctx);
static hubbub_error token_handler(const hubbub_token *token, void *pw);
+size_t get_len(const char *str);

int main(int argc, char **argv)
{
@@ -85,7 +86,7 @@ int main(int argc, char **argv)
} else if (strcmp(key, "lastStartTag") == 0) {
ctx.last_start_tag = (const char *)
json_object_get_string(val);
- } else if (strcmp(key, "contentModelFlags") == 0) {
+ } else if (strcmp(key, "initialStates") == 0) {
ctx.content_model =
json_object_get_array(val);
} else if (strcmp(key, "processCDATA") == 0) {
@@ -103,6 +104,13 @@ int main(int argc, char **argv)
return 0;
}

+size_t get_len(const char *str) {
+ if (str == NULL) {
+ return 0;
+ } else {
+ return strlen(str);
+ }
+}
void run_test(context *ctx)
{
parserutils_inputstream *stream;
@@ -132,7 +140,7 @@ void run_test(context *ctx)

if (ctx->last_start_tag != NULL) {
/* Fake up a start tag, in PCDATA state */
- size_t len = strlen(ctx->last_start_tag) + 3;
+ size_t len = get_len(ctx->last_start_tag) + 3;
uint8_t *buf = malloc(len);

snprintf((char *) buf, len, "<%s>",
@@ -308,21 +316,21 @@ hubbub_error token_handler(const hubbub_token *token, void *pw)
(int) token->data.doctype.system_id.len);
}

- assert(token->data.doctype.name.len == strlen(expname));
- assert(strncmp(gotname, expname, strlen(expname)) == 0);
+ assert(token->data.doctype.name.len == get_len(expname));
+ assert(strncmp(gotname, expname, get_len(expname)) == 0);

assert((exppub == NULL) ==
(token->data.doctype.public_missing == true));
if (exppub) {
- assert(token->data.doctype.public_id.len == strlen(exppub));
- assert(strncmp(gotpub, exppub, strlen(exppub)) == 0);
+ assert(token->data.doctype.public_id.len == get_len(exppub));
+ assert(strncmp(gotpub, exppub, get_len(exppub)) == 0);
}

assert((expsys == NULL) ==
(token->data.doctype.system_missing == true));
if (gotsys) {
- assert(token->data.doctype.system_id.len == strlen(expsys));
- assert(strncmp(gotsys, expsys, strlen(expsys)) == 0);
+ assert(token->data.doctype.system_id.len == get_len(expsys));
+ assert(strncmp(gotsys, expsys, get_len(expsys)) == 0);
}

assert(expquirks == token->data.doctype.force_quirks);
@@ -354,8 +362,8 @@ hubbub_error token_handler(const hubbub_token *token, void *pw)
printf("attributes:\n");
}

- assert(token->data.tag.name.len == strlen(expname));
- assert(strncmp(tagname, expname, strlen(expname)) == 0);
+ assert(token->data.tag.name.len == get_len(expname));
+ assert(strncmp(tagname, expname, get_len(expname)) == 0);

assert((token->data.tag.n_attributes == 0) ==
(expattrs == NULL));
@@ -379,11 +387,11 @@ hubbub_error token_handler(const hubbub_token *token, void *pw)
(int) namelen, gotname,
(int) vallen, gotval);

- assert(namelen == strlen(expname));
+ assert(namelen == get_len(expname));
assert(strncmp(gotname, expname,
- strlen(expname)) == 0);
- assert(vallen == strlen(expval));
- assert(strncmp(gotval, expval, strlen(expval)) == 0);
+ get_len(expname)) == 0);
+ assert(vallen == get_len(expval));
+ assert(strncmp(gotval, expval, get_len(expval)) == 0);

expattrs = expattrs->next;
}
@@ -404,8 +412,8 @@ hubbub_error token_handler(const hubbub_token *token, void *pw)
(token->data.tag.n_attributes > 0) ?
"attributes:" : "");

- assert(token->data.tag.name.len == strlen(expname));
- assert(strncmp(tagname, expname, strlen(expname)) == 0);
+ assert(token->data.tag.name.len == get_len(expname));
+ assert(strncmp(tagname, expname, get_len(expname)) == 0);
}
break;
case HUBBUB_TOKEN_COMMENT:
@@ -419,20 +427,20 @@ hubbub_error token_handler(const hubbub_token *token, void *pw)
printf(" got: '%.*s'\n",
(int) token->data.comment.len, gotstr);

- assert(token->data.comment.len == strlen(expstr));
- assert(strncmp(gotstr, expstr, strlen(expstr)) == 0);
+ assert(token->data.comment.len == get_len(expstr));
+ assert(strncmp(gotstr, expstr, get_len(expstr)) == 0);
}
break;
case HUBBUB_TOKEN_CHARACTER:
{
- int expstrlen = json_object_get_string_len(
+ int expget_len = json_object_get_string_len(
array_list_get_idx(items, 1));
const char *expstr = json_object_get_string(
array_list_get_idx(items, 1));
const char *gotstr = (const char *)
token->data.character.ptr;
size_t len = min(token->data.character.len,
- expstrlen - ctx->char_off);
+ expget_len - ctx->char_off);

printf("expected: '%.*s'\n",
(int) len, expstr + ctx->char_off);
@@ -454,7 +462,7 @@ hubbub_error token_handler(const hubbub_token *token, void *pw)
ctx->char_off = 0;

token_handler(&t, pw);
- } else if (strlen(expstr + ctx->char_off) >
+ } else if (get_len(expstr + ctx->char_off) >
token->data.character.len) {
/* Tokeniser output only contained part of the data
* in the expected token; calculate the offset into
--
1.8.3.2
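
Most of the tokeniser3.c churn swaps strlen() for the new NULL-safe get_len(): json_object_get_string() returns NULL when a test omits a field, and passing NULL to strlen() is undefined behaviour. The wrapper boils down to:

#include <stddef.h>
#include <string.h>

/* NULL-safe strlen: treat a missing expected string as empty
 * rather than invoking undefined behaviour. */
size_t get_len(const char *str)
{
	return str == NULL ? 0 : strlen(str);
}

(The expstrlen-to-expget_len rename in the character-token case looks like a by-product of the same mechanical search-and-replace; behaviour is unchanged.)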
