fix: allow case-insensitive DOCTYPE in HTML by karfau · Pull Request #819 · xmldom/xmldom · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

fix: allow case-insensitive DOCTYPE in HTML #819

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jan 19, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion lib/conventions.js
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@ var MIME_TYPE = freeze({
XML_APPLICATION: 'application/xml',

/**
* `text/html`, an alias for `application/xml`.
* `text/xml`, an alias for `application/xml`.
*
* @see https://tools.ietf.org/html/rfc7303#section-9.2 RFC 7303
* @see https://www.iana.org/assignments/media-types/text/xml IANA MimeType registration
Expand Down
5 changes: 5 additions & 0 deletions lib/grammar.js
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,9 @@ var ATTLIST_DECL_START = '<!ATTLIST';
// to support XML without namespaces in DTD we can not restrict it to QName
var AttlistDecl = reg(ATTLIST_DECL_START, S, Name, AttDef, '*', S_OPT, '>');

// https://html.spec.whatwg.org/multipage/urls-and-fetching.html#about:legacy-compat
// Literal value of the `about:legacy-compat` URL described at the link above.
var ABOUT_LEGACY_COMPAT = 'about:legacy-compat';
// Pattern for ABOUT_LEGACY_COMPAT including its surrounding quotes:
// either the double-quoted or the single-quoted form.
// NOTE(review): `regg` is defined elsewhere in this file; presumably it joins
// its arguments into a grouped regular expression - confirm.
var ABOUT_LEGACY_COMPAT_SystemLiteral = regg('"' + ABOUT_LEGACY_COMPAT + '"', '|', "'" + ABOUT_LEGACY_COMPAT + "'");
var SYSTEM = 'SYSTEM';
var PUBLIC = 'PUBLIC';
// https://www.w3.org/TR/xml11/#NT-ExternalID
Expand Down Expand Up @@ -494,6 +497,8 @@ exports.chars_without = chars_without;
exports.detectUnicodeSupport = detectUnicodeSupport;
exports.reg = reg;
exports.regg = regg;
exports.ABOUT_LEGACY_COMPAT = ABOUT_LEGACY_COMPAT;
exports.ABOUT_LEGACY_COMPAT_SystemLiteral = ABOUT_LEGACY_COMPAT_SystemLiteral;
exports.AttlistDecl = AttlistDecl;
exports.CDATA_START = CDATA_START;
exports.CDATA_END = CDATA_END;
Expand Down
40 changes: 33 additions & 7 deletions lib/sax.js
Original file line number Diff line number Diff line change
Expand Up @@ -587,8 +587,10 @@ function _copy(source, target) {
* @property {function(): string} substringFromIndex
* creates a substring from the current index to the end of `source`
* @property {function(compareWith: string): boolean} substringStartsWith
* Checks if source contains `compareWith`,
* starting from the current index.
* Checks if `source` contains `compareWith`, starting from the current index.
* @property {function(compareWith: string): boolean} substringStartsWithCaseInsensitive
* Checks if `source` contains `compareWith`, starting from the current index,
* comparing the upper case of both sides.
* @see {@link parseUtils}
*/

Expand Down Expand Up @@ -634,6 +636,9 @@ function parseUtils(source, start) {
/**
 * Checks if `source` contains `text`, starting from the current `index`.
 * The comparison is case-sensitive.
 *
 * `source` and `index` are not parameters; they are read from the enclosing scope.
 *
 * @param {string} text
 * The string expected at the current index.
 * @returns {boolean}
 * `true` if the `text.length` characters of `source` beginning at `index` equal `text`.
 */
function substringStartsWith(text) {
	return source.substring(index, index + text.length) === text;
}
/**
 * Case-insensitive variant of the "starts with" check: compares the upper-cased
 * slice of `source` that begins at the current `index` with the upper-cased `text`.
 *
 * `source` and `index` are not parameters; they are read from the enclosing scope.
 *
 * @param {string} text
 * The string expected at the current index, in any letter case.
 * @returns {boolean}
 * `true` if both sides are equal after upper-casing.
 */
function substringStartsWithCaseInsensitive(text) {
	var end = index + text.length;
	var candidate = source.substring(index, end);
	var expected = text.toUpperCase();
	return candidate.toUpperCase() === expected;
}

function getMatch(args) {
var expr = g.reg('^', args);
Expand All @@ -657,6 +662,7 @@ function parseUtils(source, start) {
skipBlanks: skipBlanks,
substringFromIndex: substringFromIndex,
substringStartsWith: substringStartsWith,
substringStartsWithCaseInsensitive: substringStartsWithCaseInsensitive,
};
}

Expand Down Expand Up @@ -753,7 +759,7 @@ function parseDoctypeInternalSubset(p, errorHandler) {
function parseDoctypeCommentOrCData(source, start, domBuilder, errorHandler, isHTML) {
var p = parseUtils(source, start);

switch (p.char(2)) {
switch (isHTML ? p.char(2).toUpperCase() : p.char(2)) {
case '-':
// should be a comment
var comment = p.getMatch(g.Comment);
Expand Down Expand Up @@ -782,7 +788,7 @@ function parseDoctypeCommentOrCData(source, start, domBuilder, errorHandler, isH
if (domBuilder.doc && domBuilder.doc.documentElement) {
return errorHandler.fatalError('Doctype not allowed inside or after documentElement at position ' + p.getIndex());
}
if (!p.substringStartsWith(g.DOCTYPE_DECL_START)) {
if (isHTML ? !p.substringStartsWithCaseInsensitive(g.DOCTYPE_DECL_START) : !p.substringStartsWith(g.DOCTYPE_DECL_START)) {
return errorHandler.fatalError('Expected ' + g.DOCTYPE_DECL_START + ' at position ' + p.getIndex());
}
p.skip(g.DOCTYPE_DECL_START.length);
Expand All @@ -800,6 +806,10 @@ function parseDoctypeCommentOrCData(source, start, domBuilder, errorHandler, isH
doctype.name = p.getMatch(g.Name);
if (!doctype.name)
return errorHandler.fatalError('doctype name missing or contains unexpected characters at position ' + p.getIndex());

if (isHTML && doctype.name.toLowerCase() !== 'html') {
errorHandler.warning('Unexpected DOCTYPE in HTML document at position ' + p.getIndex());
}
p.skipBlanks();

// Check for ExternalID
Expand All @@ -815,10 +825,26 @@ function parseDoctypeCommentOrCData(source, start, domBuilder, errorHandler, isH
doctype.publicId = match.groups.PubidLiteral;
}
p.skip(match[0].length);
} else if (isHTML && p.substringStartsWithCaseInsensitive(g.SYSTEM)) {
// https://html.spec.whatwg.org/multipage/syntax.html#doctype-legacy-string
p.skip(g.SYSTEM.length);
if (p.skipBlanks() < 1) {
return errorHandler.fatalError('Expected whitespace after ' + g.SYSTEM + ' at position ' + p.getIndex());
}
doctype.systemId = p.getMatch(g.ABOUT_LEGACY_COMPAT_SystemLiteral);
if (!doctype.systemId) {
return errorHandler.fatalError(
'Expected ' + g.ABOUT_LEGACY_COMPAT + ' in single or double quotes after ' + g.SYSTEM + ' at position ' + p.getIndex()
);
}
}
if (isHTML && doctype.systemId && !g.ABOUT_LEGACY_COMPAT_SystemLiteral.test(doctype.systemId)) {
errorHandler.warning('Unexpected doctype.systemId in HTML document at position ' + p.getIndex());
}
if (!isHTML) {
p.skipBlanks();
doctype.internalSubset = parseDoctypeInternalSubset(p, errorHandler);
}

p.skipBlanks();
doctype.internalSubset = parseDoctypeInternalSubset(p, errorHandler);
p.skipBlanks();
if (p.char() !== '>') {
return errorHandler.fatalError('doctype not terminated with > at position ' + p.getIndex());
Expand Down
1 change: 1 addition & 0 deletions test/grammar/__snapshots__/regexp.test.js.snap
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ exports[`all grammar regular expressions should have the expected keys 1`] = `
"S",
"S_OPT",
"SystemLiteral",
"ABOUT_LEGACY_COMPAT_SystemLiteral",
"Char",
"CDSect",
"PubidLiteral",
Expand Down
1 change: 1 addition & 0 deletions test/grammar/regexp.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 1 addition & 3 deletions test/parse/node.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -179,9 +179,7 @@ describe('XML Node Parse', () => {
});

test('preserves doctype with public id and sysid', () => {
const DOCTYPE =
'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"' +
' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">';
const DOCTYPE = `<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">`;

const actual = new DOMParser().parseFromString(`${DOCTYPE}<html/>`, 'text/html').toString();

Expand Down
127 changes: 127 additions & 0 deletions test/sax/parseDoctypeInternalSubset.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -167,4 +167,131 @@ describe('parseDoctypeCommentOrCData', () => {
expect(domBuilder.startDTD).toHaveBeenCalledWith(Name, '"pubId"', '"sysId"', internalSubset);
expect(domBuilder.endDTD).toHaveBeenCalled();
});
// Covers parseDoctypeCommentOrCData when its fifth argument (isHTML) is true:
// case-insensitive `<!DOCTYPE`, the doctype name check, the legacy
// `SYSTEM "about:legacy-compat"` string and the rejection of an internal subset.
describe('when isHtml is true', () => {
	// Shared fixtures: the doctype name in lower and upper case, and the flag under test.
	const html = 'html';
	const HTML = 'HTML';
	const isHtml = true;
	test('should report fatal error and return with incomplete DOCTYPE decl', () => {
		const start = 0;
		const source = '<!d';
		const errorHandler = { fatalError: jest.fn() };

		const returned = parseDoctypeCommentOrCData(source, start, { doc: {} }, errorHandler, isHtml);

		expect(returned).toBe(undefined);
		expect(errorHandler.fatalError).toHaveBeenCalledWith(expect.stringContaining(g.DOCTYPE_DECL_START));
	});
	test('should report warning when doctype name is not html', () => {
		const start = 0;
		const source = '<!doctype fantasy>';
		const domBuilder = { startDTD: jest.fn(), endDTD: jest.fn() };
		const errorHandler = { warning: jest.fn() };

		const returned = parseDoctypeCommentOrCData(source, start, domBuilder, errorHandler, isHtml);

		// Only a warning: parsing still consumes the whole declaration.
		expect(returned).toBe(source.length);
		expect(errorHandler.warning).toHaveBeenCalledWith(expect.stringContaining('Unexpected DOCTYPE in HTML document'));
	});

	it('should accept upper case doctype and name', () => {
		const source = `${g.DOCTYPE_DECL_START} ${HTML}>`;
		const domBuilder = { startDTD: jest.fn(), endDTD: jest.fn() };
		const returned = parseDoctypeCommentOrCData(source, 0, domBuilder, {}, isHtml);

		expect(returned).toBe(source.length);
		expect(domBuilder.startDTD).toHaveBeenCalledWith(HTML, undefined, undefined, undefined);
		expect(domBuilder.endDTD).toHaveBeenCalled();
	});
	it('should accept lower case doctype and name', () => {
		const source = `${g.DOCTYPE_DECL_START.toLowerCase()} ${html}>`;
		const domBuilder = { startDTD: jest.fn(), endDTD: jest.fn() };
		const returned = parseDoctypeCommentOrCData(source, 0, domBuilder, {}, isHtml);

		expect(returned).toBe(source.length);
		expect(domBuilder.startDTD).toHaveBeenCalledWith(html, undefined, undefined, undefined);
		expect(domBuilder.endDTD).toHaveBeenCalled();
	});
	it('should accept mixed case doctype and name', () => {
		const source = `<!DocType Html>`;
		const domBuilder = { startDTD: jest.fn(), endDTD: jest.fn() };
		const returned = parseDoctypeCommentOrCData(source, 0, domBuilder, {}, isHtml);

		// The name is passed on exactly as written in the source.
		expect(returned).toBe(source.length);
		expect(domBuilder.startDTD).toHaveBeenCalledWith('Html', undefined, undefined, undefined);
		expect(domBuilder.endDTD).toHaveBeenCalled();
	});
	it(`should accept and preserve doctype with lower case system and '${g.ABOUT_LEGACY_COMPAT}'`, () => {
		const source = `${g.DOCTYPE_DECL_START} ${HTML} system '${g.ABOUT_LEGACY_COMPAT}'>`;
		const domBuilder = { startDTD: jest.fn(), endDTD: jest.fn() };
		const returned = parseDoctypeCommentOrCData(source, 0, domBuilder, {}, isHtml);

		// The system id is reported including its single quotes.
		expect(returned).toBe(source.length);
		expect(domBuilder.startDTD).toHaveBeenCalledWith(HTML, undefined, `'${g.ABOUT_LEGACY_COMPAT}'`, undefined);
		expect(domBuilder.endDTD).toHaveBeenCalled();
	});
	it(`should accept and preserve doctype with upper case system and "${g.ABOUT_LEGACY_COMPAT}"`, () => {
		const source = `${g.DOCTYPE_DECL_START} ${HTML} ${g.SYSTEM} "${g.ABOUT_LEGACY_COMPAT}">`;
		const domBuilder = { startDTD: jest.fn(), endDTD: jest.fn() };
		const returned = parseDoctypeCommentOrCData(source, 0, domBuilder, {}, isHtml);

		// The system id is reported including its double quotes.
		expect(returned).toBe(source.length);
		expect(domBuilder.startDTD).toHaveBeenCalledWith(HTML, undefined, `"${g.ABOUT_LEGACY_COMPAT}"`, undefined);
		expect(domBuilder.endDTD).toHaveBeenCalled();
	});
	it(`should report fatal error if system is lower case and systemId is not ${g.ABOUT_LEGACY_COMPAT}`, () => {
		const source = `${g.DOCTYPE_DECL_START} ${HTML} ${g.SYSTEM.toLowerCase()} "whatever">`;
		const domBuilder = { startDTD: jest.fn(), endDTD: jest.fn() };
		const errorHandler = { fatalError: jest.fn() };

		const returned = parseDoctypeCommentOrCData(source, 0, domBuilder, errorHandler, isHtml);

		expect(errorHandler.fatalError).toHaveBeenCalledWith(
			expect.stringContaining('Expected ' + g.ABOUT_LEGACY_COMPAT + ' in single or double quotes after ' + g.SYSTEM)
		);
		expect(returned).toBeUndefined();
		expect(domBuilder.startDTD).not.toHaveBeenCalled();
		expect(domBuilder.endDTD).not.toHaveBeenCalled();
	});
	it(`should report fatal error and return if system is lower case and is not followed by whitespace`, () => {
		const source = `${g.DOCTYPE_DECL_START} ${HTML} ${g.SYSTEM.toLowerCase()}"${g.ABOUT_LEGACY_COMPAT}">`;
		const domBuilder = { startDTD: jest.fn(), endDTD: jest.fn() };
		const errorHandler = { fatalError: jest.fn() };

		const returned = parseDoctypeCommentOrCData(source, 0, domBuilder, errorHandler, isHtml);

		expect(returned).toBeUndefined();
		// Pin the actual message: `stringContaining('')` would match any string
		// and therefore could not tell this failure apart from any other.
		expect(errorHandler.fatalError).toHaveBeenCalledWith(expect.stringContaining('Expected whitespace after ' + g.SYSTEM));
		// The parser must bail out before reporting any doctype at all.
		expect(domBuilder.startDTD).not.toHaveBeenCalled();
		expect(domBuilder.endDTD).not.toHaveBeenCalled();
	});
	it('should accept and preserve XHTML doctype', () => {
		const source = `<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">`;
		const domBuilder = { startDTD: jest.fn(), endDTD: jest.fn() };
		const errorHandler = { warning: jest.fn() };

		const returned = parseDoctypeCommentOrCData(source, 0, domBuilder, errorHandler, isHtml);

		// A system id other than about:legacy-compat is kept but triggers a warning.
		expect(returned).toBe(source.length);
		expect(errorHandler.warning).toHaveBeenCalledWith(expect.stringContaining('Unexpected doctype.systemId in HTML document'));
		expect(domBuilder.startDTD).toHaveBeenCalledWith(
			html,
			'"-//W3C//DTD XHTML 1.0 Transitional//EN"',
			'"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"',
			undefined
		);
		expect(domBuilder.endDTD).toHaveBeenCalled();
	});
	it('should fail on doctype with DTD', () => {
		const source = `${g.DOCTYPE_DECL_START} ${HTML} ${g.SYSTEM} "${g.ABOUT_LEGACY_COMPAT}" [<!ENTITY foo "foo">]>`;
		const domBuilder = { startDTD: jest.fn(), endDTD: jest.fn() };
		const errorHandler = { fatalError: jest.fn() };

		const returned = parseDoctypeCommentOrCData(source, 0, domBuilder, errorHandler, isHtml);

		// The internal subset is not parsed in HTML mode, so `[` is found where `>` is required.
		expect(returned).toBeUndefined();
		expect(errorHandler.fatalError).toHaveBeenCalledWith(expect.stringContaining('doctype not terminated with > at position'));
		expect(domBuilder.startDTD).not.toHaveBeenCalled();
		expect(domBuilder.endDTD).not.toHaveBeenCalled();
	});
});
});
Loading
0