From 957a668f4c8d6ecf199d5343e87137e28e3b8f3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Fri, 23 Dec 2022 15:08:12 +0100 Subject: [PATCH 1/3] refactor(options): Revamp option handling htmlparser2 options have been moved out of the root options for Cheerio, making it more obvious which options will take effect. This currently only affects the TypeScript type. Users that want to use htmlparser2 should use the `xml` option: ```js const $ = cheerio.load('', { xml: { withStartIndices: true }}) ``` --- src/api/manipulation.spec.ts | 2 +- src/batteries.spec.ts | 2 +- src/batteries.ts | 6 ++-- src/index.ts | 4 +-- src/load.ts | 10 ++---- src/options.ts | 61 +++++++++++++++++++++++------------- src/parse.spec.ts | 5 +-- src/static.ts | 8 ++--- 8 files changed, 57 insertions(+), 41 deletions(-) diff --git a/src/api/manipulation.spec.ts b/src/api/manipulation.spec.ts index 2706fb2a85..88e1ade906 100644 --- a/src/api/manipulation.spec.ts +++ b/src/api/manipulation.spec.ts @@ -1974,7 +1974,7 @@ describe('$(...)', () => { }); it('() : should preserve parsing options', () => { - const $ = load('
<div>π</div>
', { decodeEntities: false }); + const $ = load('
<div>π</div>
', { xml: { decodeEntities: false } }); const $div = $('div'); expect($div.text()).toBe($div.clone().text()); diff --git a/src/batteries.spec.ts b/src/batteries.spec.ts index a26c7fb47d..3132cc18ef 100644 --- a/src/batteries.spec.ts +++ b/src/batteries.spec.ts @@ -60,7 +60,7 @@ describe('stringStream', () => { }); it('should use htmlparser2 for XML', (cb) => { - const stream = cheerio.stringStream({ xmlMode: true }, (err, $) => { + const stream = cheerio.stringStream({ xml: true }, (err, $) => { expect(err).toBeNull(); expect($.html()).toBe(TEST_HTML); diff --git a/src/batteries.ts b/src/batteries.ts index 5515c80634..e63050a752 100644 --- a/src/batteries.ts +++ b/src/batteries.ts @@ -9,7 +9,7 @@ export * from './index.js'; import type { CheerioAPI, CheerioOptions } from './index.js'; import { load } from './index.js'; -import { flatten as flattenOptions, type InternalOptions } from './options.js'; +import { flattenOptions, type InternalOptions } from './options.js'; import { adapter as htmlparser2Adapter } from 'parse5-htmlparser2-tree-adapter'; // eslint-disable-next-line n/file-extension-in-import @@ -58,7 +58,7 @@ function _stringStream( options: InternalOptions | undefined, cb: (err: Error | null | undefined, $: CheerioAPI) => void ): Writable { - if (options && (options.xmlMode || options._useHtmlParser2)) { + if (options?._useHtmlParser2) { const handler: DomHandler = new DomHandler( (err) => cb(err, load(handler.root)), options @@ -102,6 +102,7 @@ function _stringStream( * writeStream * ); * ``` + * * @param options - The options to pass to Cheerio. * @param cb - The callback to call when the stream is finished. * @returns The writable stream. @@ -176,6 +177,7 @@ const defaultRequestOptions: UndiciStreamOptions = { * * const $ = await cheerio.fromURL('https://example.com'); * ``` + * * @param url - The URL to load the document from. * @param options - The options to pass to Cheerio. * @returns The loaded document. 
diff --git a/src/index.ts b/src/index.ts index c199b151a1..c3f9e67384 100644 --- a/src/index.ts +++ b/src/index.ts @@ -31,7 +31,7 @@ import renderWithHtmlparser2 from 'dom-serializer'; import { parseDocument as parseWithHtmlparser2 } from 'htmlparser2'; const parse = getParse((content, options, isDocument, context) => - options.xmlMode || options._useHtmlParser2 + options._useHtmlParser2 ? parseWithHtmlparser2(content, options) : parseWithParse5(content, options, isDocument, context) ); @@ -52,7 +52,7 @@ const parse = getParse((content, options, isDocument, context) => * @see {@link https://cheerio.js.org#loading} for additional usage information. */ export const load = getLoad(parse, (dom, options) => - options.xmlMode || options._useHtmlParser2 + options._useHtmlParser2 ? renderWithHtmlparser2(dom, options) : renderWithParse5(dom) ); diff --git a/src/load.ts b/src/load.ts index 23f8a96a60..162d80882c 100644 --- a/src/load.ts +++ b/src/load.ts @@ -1,8 +1,7 @@ import { type CheerioOptions, type InternalOptions, - default as defaultOptions, - flatten as flattenOptions, + flattenOptions, } from './options.js'; import * as staticMethods from './static.js'; import { Cheerio } from './cheerio.js'; @@ -114,7 +113,7 @@ export function getLoad( throw new Error('cheerio.load() expects a string'); } - const internalOpts = { ...defaultOptions, ...flattenOptions(options) }; + const internalOpts = flattenOptions(options); const initialRoot = parse(content, internalOpts, isDocument, null); /** @@ -157,10 +156,7 @@ export function getLoad( // $($) if (selector && isCheerio(selector)) return selector; - const options = { - ...internalOpts, - ...flattenOptions(opts), - }; + const options = flattenOptions(opts, internalOpts); const r = typeof root === 'string' ? 
[parse(root, options, false, null)] diff --git a/src/options.ts b/src/options.ts index a75b72544d..8a51f8ed2f 100644 --- a/src/options.ts +++ b/src/options.ts @@ -22,10 +22,17 @@ export interface Parse5Options { * Please note that parser-specific options are _only recognized_ if the * relevant parser is used. */ -export interface CheerioOptions extends HTMLParser2Options, Parse5Options { +export interface CheerioOptions extends Parse5Options { /** Recommended way of configuring htmlparser2 when wanting to parse XML. */ xml?: HTMLParser2Options | boolean; + /** + * Enable xml mode, which will switch Cheerio to use htmlparser2. + * + * @deprecated Please use the `xml` option instead. + */ + xmlMode?: boolean; + /** The base URI for the document. Used for the `href` and `src` props. */ baseURI?: string | URL; // eslint-disable-line n/no-unsupported-features/node-builtins @@ -70,7 +77,9 @@ export interface CheerioOptions extends HTMLParser2Options, Parse5Options { } /** Internal options for Cheerio. */ -export interface InternalOptions extends Omit { +export interface InternalOptions + extends HTMLParser2Options, + Omit { /** * Whether to use htmlparser2. * @@ -79,17 +88,8 @@ export interface InternalOptions extends Omit { _useHtmlParser2?: boolean; } -const defaultOpts: CheerioOptions = { - xml: false, - decodeEntities: true, -}; - -/** Cheerio default options. */ -export default defaultOpts; - -const xmlModeDefault: InternalOptions = { - _useHtmlParser2: true, - xmlMode: true, +const defaultOpts: InternalOptions = { + _useHtmlParser2: false, }; /** @@ -98,14 +98,33 @@ const xmlModeDefault: InternalOptions = { * This will set `_useHtmlParser2` to true if `xml` is set to true. * * @param options - The options to flatten. + * @param baseOptions - The base options to use. * @returns The flattened options. */ -export function flatten( - options?: CheerioOptions | null -): InternalOptions | undefined { - return options?.xml - ? typeof options.xml === 'boolean' - ? 
xmlModeDefault - : { ...xmlModeDefault, ...options.xml } - : options ?? undefined; +export function flattenOptions( + options?: CheerioOptions | null, + baseOptions?: InternalOptions +): InternalOptions { + if (!options) { + return baseOptions ?? defaultOpts; + } + + const opts: InternalOptions = { + _useHtmlParser2: !!options.xmlMode, + ...baseOptions, + ...options, + }; + + if (options.xml) { + opts._useHtmlParser2 = true; + opts.xmlMode = true; + + if (options.xml !== true) { + Object.assign(opts, options.xml); + } + } else if (options.xmlMode) { + opts._useHtmlParser2 = true; + } + + return opts; } diff --git a/src/parse.spec.ts b/src/parse.spec.ts index 6d0f481d47..eb6e3c9aad 100644 --- a/src/parse.spec.ts +++ b/src/parse.spec.ts @@ -1,12 +1,13 @@ import type { Document, Element } from 'domhandler'; import { getParse } from './parse.js'; -import defaultOpts from './options.js'; import { parseDocument as parseWithHtmlparser2 } from 'htmlparser2'; import { parseWithParse5 } from './parsers/parse5-adapter.js'; +const defaultOpts = { _useHtmlParser2: false }; + const parse = getParse((content, options, isDocument, context) => - options.xmlMode || options._useHtmlParser2 + options._useHtmlParser2 ? parseWithHtmlparser2(content, options) : parseWithParse5(content, options, isDocument, context) ); diff --git a/src/static.ts b/src/static.ts index ea016ec601..e26b404cca 100644 --- a/src/static.ts +++ b/src/static.ts @@ -5,8 +5,7 @@ import { textContent } from 'domutils'; import { type InternalOptions, type CheerioOptions, - default as defaultOptions, - flatten as flattenOptions, + flattenOptions as flattenOptions, } from './options.js'; import type { ExtractedMap, ExtractMap } from './api/extract.js'; @@ -85,9 +84,8 @@ export function html( * so fallback non-existing options to the default ones. */ const opts = { - ...defaultOptions, ...this?._options, - ...flattenOptions(options ?? 
{}), + ...flattenOptions(options), }; return render(this, toRender, opts); @@ -166,7 +164,7 @@ export function parseHTML( keepScripts = context; } - const parsed = this.load(data, defaultOptions, false); + const parsed = this.load(data, this._options, false); if (!keepScripts) { parsed('script').remove(); } From 044c619e0313fd18757a54f86aaadcd78352acb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Sat, 24 Dec 2022 12:50:42 +0100 Subject: [PATCH 2/3] Improve comments --- src/options.ts | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/options.ts b/src/options.ts index 8a51f8ed2f..9e9e991ef3 100644 --- a/src/options.ts +++ b/src/options.ts @@ -23,17 +23,24 @@ export interface Parse5Options { * relevant parser is used. */ export interface CheerioOptions extends Parse5Options { - /** Recommended way of configuring htmlparser2 when wanting to parse XML. */ + /** + * Recommended way of configuring htmlparser2 when wanting to parse XML. + * + * This will switch Cheerio to use htmlparser2. + * + * @default false + */ xml?: HTMLParser2Options | boolean; /** * Enable xml mode, which will switch Cheerio to use htmlparser2. * * @deprecated Please use the `xml` option instead. + * @default false */ xmlMode?: boolean; - /** The base URI for the document. Used for the `href` and `src` props. */ + /** The base URI for the document. Used to resolve the `href` and `src` props. 
*/ baseURI?: string | URL; // eslint-disable-line n/no-unsupported-features/node-builtins /** From bba5389037d72cd67068266e84d9ae99fb5b19ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Sat, 24 Dec 2022 13:04:52 +0100 Subject: [PATCH 3/3] Allow `treeAdapter` to be overwritten --- src/batteries.ts | 12 ++++++++---- src/cheerio.spec.ts | 29 ++++++++++++----------------- src/parsers/parse5-adapter.ts | 17 +++++++---------- 3 files changed, 27 insertions(+), 31 deletions(-) diff --git a/src/batteries.ts b/src/batteries.ts index e63050a752..6eb9701415 100644 --- a/src/batteries.ts +++ b/src/batteries.ts @@ -67,10 +67,14 @@ function _stringStream( return new Htmlparser2Stream(handler, options); } - const stream = new Parse5Stream({ - ...options, - treeAdapter: htmlparser2Adapter, - }); + options ??= {}; + options.treeAdapter ??= htmlparser2Adapter; + + if (options.scriptingEnabled !== false) { + options.scriptingEnabled = true; + } + + const stream = new Parse5Stream(options); finished(stream, (err) => cb(err, load(stream.document))); diff --git a/src/cheerio.spec.ts b/src/cheerio.spec.ts index f1c8eef9be..1cf9638ede 100644 --- a/src/cheerio.spec.ts +++ b/src/cheerio.spec.ts @@ -452,9 +452,8 @@ describe('cheerio', () => { describe('parse5 options', () => { // Should parse noscript tags only with false option value test('{scriptingEnabled: ???}', () => { - const opt = 'scriptingEnabled'; const options: CheerioOptions = {}; - let result; + let result: Cheerio; // [default] scriptingEnabled: true - tag contains one text element result = cheerio.load(noscript)('noscript'); @@ -463,7 +462,7 @@ describe('cheerio', () => { expect(result[0].children[0].type).toBe('text'); // ScriptingEnabled: false - content of noscript will parsed - options[opt] = false; + options.scriptingEnabled = false; result = cheerio.load(noscript, options)('noscript'); expect(result).toHaveLength(1); expect(result[0].children).toHaveLength(2); 
@@ -474,7 +473,7 @@ describe('cheerio', () => { // ScriptingEnabled: ??? - should acts as true const values = [undefined, null, 0, '']; for (const val of values) { - options[opt] = val as any; + options.scriptingEnabled = val as any; result = cheerio.load(noscript, options)('noscript'); expect(result).toHaveLength(1); expect(result[0].children).toHaveLength(1); @@ -484,29 +483,25 @@ describe('cheerio', () => { // Should contain location data only with truthful option value test('{sourceCodeLocationInfo: ???}', () => { - const prop = 'sourceCodeLocation'; - const opt = 'sourceCodeLocationInfo'; const options: CheerioOptions = {}; - let result; - let i; // Location data should not be present let values = [undefined, null, 0, false, '']; - for (i = 0; i < values.length; i++) { - options[opt] = values[i] as any; - result = cheerio.load(noscript, options)('noscript'); + for (let i = 0; i < values.length; i++) { + options.sourceCodeLocationInfo = values[i] as any; + const result = cheerio.load(noscript, options)('noscript'); expect(result).toHaveLength(1); - expect(result[0]).not.toHaveProperty(prop); + expect(result[0]).not.toHaveProperty('sourceCodeLocation'); } // Location data should be present values = [true, 1, 'test']; - for (i = 0; i < values.length; i++) { - options[opt] = values[i] as any; - result = cheerio.load(noscript, options)('noscript'); + for (let i = 0; i < values.length; i++) { + options.sourceCodeLocationInfo = values[i] as any; + const result = cheerio.load(noscript, options)('noscript'); expect(result).toHaveLength(1); - expect(result[0]).toHaveProperty(prop); - expect(typeof (result[0] as any)[prop]).toBe('object'); + expect(result[0]).toHaveProperty('sourceCodeLocation'); + expect(typeof (result[0] as any)['sourceCodeLocation']).toBe('object'); } }); }); diff --git a/src/parsers/parse5-adapter.ts b/src/parsers/parse5-adapter.ts index 8bdb32c28a..3d630c94c5 100644 --- a/src/parsers/parse5-adapter.ts +++ b/src/parsers/parse5-adapter.ts @@ -23,18 
+23,15 @@ export function parseWithParse5( isDocument: boolean, context: ParentNode | null ): Document { - const opts = { - scriptingEnabled: - typeof options.scriptingEnabled === 'boolean' - ? options.scriptingEnabled - : true, - treeAdapter: htmlparser2Adapter, - sourceCodeLocationInfo: options.sourceCodeLocationInfo, - }; + options.treeAdapter ??= htmlparser2Adapter; + + if (options.scriptingEnabled !== false) { + options.scriptingEnabled = true; + } return isDocument - ? parseDocument(content, opts) - : parseFragment(context, content, opts); + ? parseDocument(content, options) + : parseFragment(context, content, options); } const renderOpts = { treeAdapter: htmlparser2Adapter };