---
 src/tokeniser/tokeniser.c              |  2 +-
 test/data/tokeniser2/INDEX             |  5 ++++-
 test/data/tokeniser2/escapeFlag.test   | 24 ++++++++--------
 test/data/tokeniser2/unicodeChars.test |  8 ------
 test/tokeniser2.c                      |  2 +-
 test/tokeniser3.c                      | 46 +++++++++++++++++++-------------
 6 files changed, 45 insertions(+), 42 deletions(-)
diff --git a/src/tokeniser/tokeniser.c b/src/tokeniser/tokeniser.c
index 8390bf0..41a4b0e 100644
--- a/src/tokeniser/tokeniser.c
+++ b/src/tokeniser/tokeniser.c
@@ -753,7 +753,7 @@ hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
 			}
 
 			/* Emit a replacement character */
-			emit_current_chars(tokeniser);
+			emit_character_token(tokeniser, u_fffd_str);
 
 			/* Advance past NUL */
 			parserutils_inputstream_advance(tokeniser->input, 1);
diff --git a/test/data/tokeniser2/INDEX b/test/data/tokeniser2/INDEX
index 9da56e7..9ff8596 100644
--- a/test/data/tokeniser2/INDEX
+++ b/test/data/tokeniser2/INDEX
@@ -10,6 +10,9 @@ entities.test		html5lib entity tests
 escapeFlag.test		html5lib escape flag tests
 numericEntities.test	html5lib numeric entities tests
 unicodeChars.test	html5lib unicode character tests
+#unicodeCharsProblematic.test	html5lib problematic unicode character tests
 cdata.test		CDATA section tests
-regression.test		Regression tests
+#regression.test		Regression tests
 namedEntities.test	html5lib named entities tests
+pendingSpecChanges.test	html5lib spec changes tests
+#xmlViolation.test	html5lib XML violation tests
diff --git a/test/data/tokeniser2/escapeFlag.test b/test/data/tokeniser2/escapeFlag.test
index 4c4bf51..18cb430 100644
--- a/test/data/tokeniser2/escapeFlag.test
+++ b/test/data/tokeniser2/escapeFlag.test
@@ -1,33 +1,33 @@
 {"tests": [
 
-{"description":"Commented close tag in [R]CDATA",
-"contentModelFlags":["RCDATA", "CDATA"],
+{"description":"Commented close tag in RCDATA or RAWTEXT",
+"initialStates":["RCDATA state", "RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"foo<!--</xmp>--></xmp>",
-"output":[["Character", "foo<!--</xmp>-->"], ["EndTag", "xmp"]]},
+"output":[["Character", "foo<!--"], ["EndTag", "xmp"], ["Character", "-->"], ["EndTag", "xmp"]]},
 
-{"description":"Bogus comment in [R]CDATA",
-"contentModelFlags":["RCDATA", "CDATA"],
+{"description":"Bogus comment in RCDATA or RAWTEXT",
+"initialStates":["RCDATA state", "RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"foo<!-->baz</xmp>",
 "output":[["Character", "foo<!-->baz"], ["EndTag", "xmp"]]},
 
-{"description":"End tag surrounded by bogus comment in [R]CDATA",
-"contentModelFlags":["RCDATA", "CDATA"],
+{"description":"End tag surrounded by bogus comment in RCDATA or RAWTEXT",
+"initialStates":["RCDATA state", "RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"foo<!--></xmp><!-->baz</xmp>",
 "output":[["Character", "foo<!-->"], ["EndTag", "xmp"], "ParseError", ["Comment", ""], ["Character", "baz"], ["EndTag", "xmp"]]},
 
 {"description":"Commented entities in RCDATA",
-"contentModelFlags":["RCDATA"],
+"initialStates":["RCDATA state"],
 "lastStartTag":"xmp",
 "input":" & <!-- & --> & </xmp>",
-"output":[["Character", " & <!-- & --> & "], ["EndTag", "xmp"]]},
+"output":[["Character", " & <!-- & --> & "], ["EndTag", "xmp"]]},
 
-{"description":"Incorrect comment ending sequences in [R]CDATA",
-"contentModelFlags":["RCDATA", "CDATA"],
+{"description":"Incorrect comment ending sequences in RCDATA or RAWTEXT",
+"initialStates":["RCDATA state", "RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"foo<!-- x --x>x-- >x--!>x--<></xmp>",
-"output":[["Character", "foo<!-- x --x>x-- >x--!>x--<></xmp>"]]}
+"output":[["Character", "foo<!-- x --x>x-- >x--!>x--<>"], ["EndTag", "xmp"]]}
 
 ]}
diff --git a/test/data/tokeniser2/unicodeChars.test b/test/data/tokeniser2/unicodeChars.test
index 9b59015..c778668 100644
--- a/test/data/tokeniser2/unicodeChars.test
+++ b/test/data/tokeniser2/unicodeChars.test
@@ -112,14 +112,6 @@
 "input": "\u007F",
 "output": ["ParseError", ["Character", "\u007F"]]},
 
-{"description": "Invalid Unicode character U+D800",
-"input": "\uD800",
-"output": ["ParseError", ["Character", "\uD800"]]},
-
-{"description": "Invalid Unicode character U+DFFF",
-"input": "\uDFFF",
-"output": ["ParseError", ["Character", "\uDFFF"]]},
-
 {"description": "Invalid Unicode character U+FDD0",
 "input": "\uFDD0",
 "output": ["ParseError", ["Character", "\uFDD0"]]},
diff --git a/test/tokeniser2.c b/test/tokeniser2.c
index bf0e69f..3024e81 100644
--- a/test/tokeniser2.c
+++ b/test/tokeniser2.c
@@ -86,7 +86,7 @@ int main(int argc, char **argv)
 			} else if (strcmp(key, "lastStartTag") == 0) {
 				ctx.last_start_tag = (const char *)
 						json_object_get_string(val);
-			} else if (strcmp(key, "contentModelFlags") == 0) {
+			} else if (strcmp(key, "initialStates") == 0) {
 				ctx.content_model =
 						json_object_get_array(val);
 			} else if (strcmp(key, "processCDATA") == 0) {
diff --git a/test/tokeniser3.c b/test/tokeniser3.c
index 949ddd0..c4c5231 100644
--- a/test/tokeniser3.c
+++ b/test/tokeniser3.c
@@ -29,6 +29,7 @@ typedef struct context {
 
 static void run_test(context *ctx);
 static hubbub_error token_handler(const hubbub_token *token, void *pw);
+static size_t get_len(const char *str);
 
 int main(int argc, char **argv)
 {
@@ -85,7 +86,7 @@ int main(int argc, char **argv)
 			} else if (strcmp(key, "lastStartTag") == 0) {
 				ctx.last_start_tag = (const char *)
 						json_object_get_string(val);
-			} else if (strcmp(key, "contentModelFlags") == 0) {
+			} else if (strcmp(key, "initialStates") == 0) {
 				ctx.content_model =
 						json_object_get_array(val);
 			} else if (strcmp(key, "processCDATA") == 0) {
@@ -103,6 +104,13 @@ int main(int argc, char **argv)
 	return 0;
 }
 
+/* NULL-safe strlen(): treat a NULL string as empty */
+static size_t get_len(const char *str)
+{
+	if (str == NULL)
+		return 0;
+	return strlen(str);
+}
 void run_test(context *ctx)
 {
 	parserutils_inputstream *stream;
@@ -132,7 +140,7 @@ void run_test(context *ctx)
 
 		if (ctx->last_start_tag != NULL) {
 			/* Fake up a start tag, in PCDATA state */
-			size_t len = strlen(ctx->last_start_tag) + 3;
+			size_t len = get_len(ctx->last_start_tag) + 3;
 			uint8_t *buf = malloc(len);
 
 			snprintf((char *) buf, len, "<%s>",
@@ -308,21 +316,21 @@ hubbub_error token_handler(const hubbub_token *token, void *pw)
 				(int) token->data.doctype.system_id.len);
 		}
 
-		assert(token->data.doctype.name.len == strlen(expname));
-		assert(strncmp(gotname, expname, strlen(expname)) == 0);
+		assert(token->data.doctype.name.len == get_len(expname));
+		assert(strncmp(gotname, expname, get_len(expname)) == 0);
 
 		assert((exppub == NULL) ==
 				(token->data.doctype.public_missing == true));
 		if (exppub) {
-			assert(token->data.doctype.public_id.len == strlen(exppub));
-			assert(strncmp(gotpub, exppub, strlen(exppub)) == 0);
+			assert(token->data.doctype.public_id.len == get_len(exppub));
+			assert(strncmp(gotpub, exppub, get_len(exppub)) == 0);
 		}
 
 		assert((expsys == NULL) ==
 				(token->data.doctype.system_missing == true));
 		if (gotsys) {
-			assert(token->data.doctype.system_id.len == strlen(expsys));
-			assert(strncmp(gotsys, expsys, strlen(expsys)) == 0);
+			assert(token->data.doctype.system_id.len == get_len(expsys));
+			assert(strncmp(gotsys, expsys, get_len(expsys)) == 0);
 		}
 
 		assert(expquirks == token->data.doctype.force_quirks);
@@ -354,8 +362,8 @@ hubbub_error token_handler(const hubbub_token *token, void *pw)
 			printf("attributes:\n");
 		}
 
-		assert(token->data.tag.name.len == strlen(expname));
-		assert(strncmp(tagname, expname, strlen(expname)) == 0);
+		assert(token->data.tag.name.len == get_len(expname));
+		assert(strncmp(tagname, expname, get_len(expname)) == 0);
 
 		assert((token->data.tag.n_attributes == 0) ==
 				(expattrs == NULL));
@@ -379,11 +387,11 @@ hubbub_error token_handler(const hubbub_token *token, void *pw)
 					(int) namelen, gotname,
 					(int) vallen, gotval);
 
-			assert(namelen == strlen(expname));
+			assert(namelen == get_len(expname));
 			assert(strncmp(gotname, expname,
-						strlen(expname)) == 0);
-			assert(vallen == strlen(expval));
-			assert(strncmp(gotval, expval, strlen(expval)) == 0);
+						get_len(expname)) == 0);
+			assert(vallen == get_len(expval));
+			assert(strncmp(gotval, expval, get_len(expval)) == 0);
 
 			expattrs = expattrs->next;
 		}
@@ -404,8 +412,8 @@ hubbub_error token_handler(const hubbub_token *token, void *pw)
 				(token->data.tag.n_attributes > 0) ?
 						"attributes:" : "");
 
-		assert(token->data.tag.name.len == strlen(expname));
-		assert(strncmp(tagname, expname, strlen(expname)) == 0);
+		assert(token->data.tag.name.len == get_len(expname));
+		assert(strncmp(tagname, expname, get_len(expname)) == 0);
 	}
 		break;
 	case HUBBUB_TOKEN_COMMENT:
@@ -419,20 +427,20 @@ hubbub_error token_handler(const hubbub_token *token, void *pw)
 		printf("     got: '%.*s'\n",
 				(int) token->data.comment.len, gotstr);
 
-		assert(token->data.comment.len == strlen(expstr));
-		assert(strncmp(gotstr, expstr, strlen(expstr)) == 0);
+		assert(token->data.comment.len == get_len(expstr));
+		assert(strncmp(gotstr, expstr, get_len(expstr)) == 0);
 	}
 		break;
 	case HUBBUB_TOKEN_CHARACTER:
 	{
 		int expstrlen = json_object_get_string_len(
 				array_list_get_idx(items, 1));
 		const char *expstr = json_object_get_string(
 				array_list_get_idx(items, 1));
 		const char *gotstr = (const char *)
 				token->data.character.ptr;
 		size_t len = min(token->data.character.len,
 				expstrlen - ctx->char_off);
 
 		printf("expected: '%.*s'\n",
 				(int) len, expstr + ctx->char_off);
@@ -454,7 +462,7 @@ hubbub_error token_handler(const hubbub_token *token, void *pw)
 			ctx->char_off = 0;
 
 			token_handler(&t, pw);
-		} else if (strlen(expstr + ctx->char_off) >
+		} else if (get_len(expstr + ctx->char_off) >
 				token->data.character.len) {
 			/* Tokeniser output only contained part of the data
 			 * in the expected token; calculate the offset into
-- 
1.8.3.2
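
An aside on the get_len() helper introduced in tokeniser3.c above: it is there because json-c represents a JSON null field as a NULL object pointer, so json_object_get_string() can hand back NULL (e.g. for a missing DOCTYPE public or system id), and passing NULL to strlen() is undefined behaviour. Below is a minimal, self-contained sketch of the same pattern; safe_strlen and the variable names are illustrative only, not part of the patch:

	#include <assert.h>
	#include <stddef.h>
	#include <string.h>

	/* NULL-safe strlen(): a missing (NULL) string counts as empty */
	static size_t safe_strlen(const char *str)
	{
		return (str == NULL) ? 0 : strlen(str);
	}

	int main(void)
	{
		const char *name = "xmp";	/* field present in the test data */
		const char *sysid = NULL;	/* absent field, as json-c reports it */

		assert(safe_strlen(name) == 3);
		assert(safe_strlen(sysid) == 0);	/* plain strlen(sysid) is undefined behaviour */
		return 0;
	}

This keeps the asserts in token_handler() well-defined even when an expected string is null in the test data, at the cost of treating "missing" and "empty" alike, which is what those length comparisons want anyway.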