Mercurial > addons > firefox-addons > feed-preview
view js/feed-parser.js @ 18:15db49e77deb
Fix parsing the description of RSS 0.9 feeds
Add missing namespace to XPath query.
author | Guido Berhoerster <guido+feed-preview@berhoerster.name> |
---|---|
date | Thu, 13 Dec 2018 08:47:06 +0100 |
parents | 48cabd01ef64 |
children | 3fcd2209b39a |
line wrap: on
line source
/*
 * Copyright (C) 2018 Guido Berhoerster <guido+feed-preview@berhoerster.name>
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */

'use strict';

/** XML namespace URIs used while probing and querying feed documents. */
export const XMLNS = {
    ATOM03: 'http://purl.org/atom/ns#',
    ATOM10: 'http://www.w3.org/2005/Atom',
    RSS09: 'http://my.netscape.com/rdf/simple/0.9/',
    XHTML: 'http://www.w3.org/1999/xhtml',
    PARSERERROR: 'http://www.mozilla.org/newlayout/xml/parsererror.xml'
};

/** URL schemes which are accepted for links found in feeds. */
const ALLOWED_LINK_PROTOCOLS = new Set(['http:', 'https:', 'ftp:']);

/** Replacement entities for the characters escaped by encodeXML(). */
const XML_ENTITIES = {
    '<': '&lt;',
    '>': '&gt;',
    '&': '&amp;',
    '\'': '&apos;',
    '"': '&quot;'
};

/**
 * Escape the five XML special characters in a string.
 *
 * @param {string} str - Text to escape.
 * @returns {string} Text that is safe to embed in markup.
 */
function encodeXML(str) {
    return str.replace(/[<>&'"]/g, c => XML_ENTITIES[c]);
}

/**
 * Parse a date string, falling back to the epoch for unparsable input.
 *
 * @param {string} s - Date string as found in the feed.
 * @returns {Date} The parsed date or `new Date(0)`.
 */
function parseDate(s) {
    let date = new Date(s);
    return Number.isNaN(date.getTime()) ? new Date(0) : date;
}

/**
 * Resolve a link against an optional base URL and check its scheme.
 *
 * @param {?string} text - Absolute or relative URL text.
 * @param {string|URL} [baseURL=''] - Base URL; an empty string means
 *     "no base".
 * @returns {?URL} The resolved URL, or null if the text is missing, cannot
 *     be parsed or uses a scheme that is not in ALLOWED_LINK_PROTOCOLS.
 */
function parseURL(text, baseURL = '') {
    // a missing attribute (null) would otherwise be stringified to "null"
    // and resolved as a relative reference
    if (typeof text !== 'string') {
        return null;
    }

    let url;
    try {
        // an empty base is not a valid URL and would make the constructor
        // throw even for absolute URLs
        url = new URL(text, baseURL === '' ? undefined : baseURL);
    } catch (e) {
        return null;
    }
    if (!ALLOWED_LINK_PROTOCOLS.has(url.protocol)) {
        return null;
    }

    return url;
}

/**
 * Decode base64-encoded UTF-8 text.
 *
 * @param {string} base64Str - Base64 input.
 * @returns {string} The decoded text.
 * @throws {TypeError} If the input is not valid base64.
 */
function base64Decode(base64Str) {
    let encodedText;
    try {
        encodedText = atob(base64Str);
    } catch (e) {
        // callers only handle TypeError, so convert the DOMException
        throw (e instanceof DOMException) ? new TypeError(e.message) : e;
    }
    // atob() yields one character per byte
    let byteBuffer = new Uint8Array(encodedText.length);
    for (let i = 0; i < encodedText.length; i++) {
        byteBuffer[i] = encodedText.charCodeAt(i);
    }

    return new TextDecoder().decode(byteBuffer);
}

/**
 * Namespace resolver for the prefixes used in the XPath queries below.
 *
 * @param {string} prefix - Namespace prefix used in a query.
 * @returns {?string} The namespace URI or null for unknown prefixes.
 */
function feedNSResolver(prefix) {
    switch (prefix) {
        case 'atom03':
            return XMLNS.ATOM03;
        case 'atom':
            return XMLNS.ATOM10;
        case 'rss':
            return XMLNS.RSS09;
    }

    return null;
}

/**
 * Evaluate an XPath query and return the first matching node.
 *
 * @param {Document} feedDocument - Document to evaluate the query on.
 * @param {Node} scopeElement - Context node of the query.
 * @param {string} xpathQuery - XPath expression.
 * @returns {?Node} The first match in document order or null.
 */
function feedQueryXPath(feedDocument, scopeElement, xpathQuery) {
    return feedDocument.evaluate(xpathQuery, scopeElement, feedNSResolver,
        XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
}

/**
 * Evaluate an XPath query and return all matching nodes.
 *
 * @param {Document} feedDocument - Document to evaluate the query on.
 * @param {Node} scopeElement - Context node of the query.
 * @param {string} xpathQuery - XPath expression.
 * @returns {Node[]} All matches in document order.
 */
function feedQueryXPathAll(feedDocument, scopeElement, xpathQuery) {
    let result = feedDocument.evaluate(xpathQuery, scopeElement,
        feedNSResolver, XPathResult.ORDERED_NODE_ITERATOR_TYPE, null);
    let nodes = [];
    for (let node = result.iterateNext(); node !== null;
        node = result.iterateNext()) {
        nodes.push(node);
    }

    return nodes;
}

/**
 * Extract the text of an Atom 0.3 content construct for the "escaped" and
 * "base64" modes.
 *
 * @param {Element} containerElement - Element holding the content.
 * @param {string} contentMode - Value of the mode attribute.
 * @returns {string|undefined} The decoded text, undefined for other modes.
 * @throws {TypeError} If base64 content cannot be decoded.
 */
function decodeAtom03Text(containerElement, contentMode) {
    if (contentMode === 'escaped') {
        return containerElement.textContent;
    } else if (contentMode === 'base64') {
        return base64Decode(containerElement.textContent);
    }

    return;
}

/** Thrown when the document is not well-formed XML. */
export class ParserError extends Error {
    constructor(...params) {
        super(...params);
        this.name = this.constructor.name;
    }
}

/** Thrown when the document is not one of the supported feed formats. */
export class UnsupportedFeedTypeError extends Error {
    constructor(message = 'Document is not a supported feed', ...params) {
        super(message, ...params);
        this.name = this.constructor.name;
    }
}

/** Describes a failed transfer of a feed; carries URL and status. */
export class ProtocolError extends Error {
    constructor(url, status, statusText, ...params) {
        let message = `Protocol error: Transfer of ${url} failed with: ` +
            `${status} ${statusText}`;
        super(message, ...params);
        this.name = this.constructor.name;
        this.url = url;
        this.status = status;
        this.statusText = statusText;
    }
}

/** Logo of a feed. */
class FeedLogo {
    constructor(url, {title = ''} = {}) {
        this.url = url;
        this.title = title;
    }
}

/** File attached to a feed entry (enclosure). */
class FeedEntryFile {
    constructor(url, {type = browser.i18n.getMessage('defaultFileType'),
        size = 0} = {}) {
        this.filename = undefined;
        this._url = undefined;
        this.url = url;
        this.type = type;
        this.size = size;
    }

    // setting the URL also derives the file name from its last path segment
    set url(url) {
        this._url = url;
        let filename = url.pathname.split('/').pop();
        this.filename = filename !== '' ? filename :
            browser.i18n.getMessage('defaultFileName');
    }

    get url() {
        return this._url;
    }
}

/** Single entry of a feed. */
class FeedEntry {
    constructor({title = browser.i18n.getMessage('defaultFeedEntryTitle'),
        link = undefined, date = new Date(0), content = '',
        files = []} = {}) {
        this.title = title;
        this.link = link;
        this.date = date;
        this._content = undefined;
        this.content = content;
        this.files = files;
    }

    /**
     * Run markup through the HTML parser and serialize it as a complete
     * document.
     *
     * @param {string|undefined} text - HTML markup of the entry.
     * @returns {string|undefined} Serialized document, undefined if text is
     *     undefined.
     */
    normalizeContent(text) {
        if (typeof text === 'undefined') {
            return;
        }
        let contentDocument = document.implementation.createHTMLDocument();
        let parsedDocument = new DOMParser().parseFromString(text,
            'text/html');
        contentDocument.body = contentDocument.adoptNode(parsedDocument.body);

        return new XMLSerializer().serializeToString(contentDocument);
    }

    set content(content) {
        this._content = this.normalizeContent(content);
    }

    get content() {
        return this._content;
    }
}

/** Parsed feed. */
class Feed {
    constructor(url, {title = browser.i18n.getMessage('defaultFeedTitle'),
        subtitle = '', logo, entries = []} = {}) {
        this.url = url;
        this.title = title;
        this.subtitle = subtitle;
        this.logo = logo;
        this.entries = entries;
    }
}

/**
 * Parser for Atom 0.3, Atom 1.0, RSS 0.9 and RSS 0.9x/2.0 documents.
 *
 * Malformed optional parts of a feed are skipped: the helper methods signal
 * them with a TypeError which the calling method catches.
 */
export class FeedParser {
    /**
     * Determine the feed format from the document element.
     *
     * @param {Document} feedDocument - Parsed XML document.
     * @returns {Array} `[type, version]`, both undefined if the document is
     *     not a supported feed.
     */
    static probeFeed(feedDocument) {
        let documentElement = feedDocument.documentElement;

        if (documentElement.nodeName === 'feed' &&
            documentElement.namespaceURI === XMLNS.ATOM03) {
            return ['atom', '0.3'];
        } else if (documentElement.nodeName === 'feed' &&
            documentElement.namespaceURI === XMLNS.ATOM10) {
            return ['atom', '1.0'];
        } else if (documentElement.nodeName === 'rss') {
            let version = documentElement.getAttribute('version');
            switch (version) {
                case '0.90':
                case '0.91':
                case '0.92':
                case '0.93':
                case '0.94':
                case '2.0':
                    return ['rss', version];
            }
        } else if (documentElement.localName.toLowerCase() === 'rdf' &&
            documentElement.getAttribute('xmlns') === XMLNS.RSS09) {
            return ['rss', '0.9'];
        }

        return [undefined, undefined];
    }

    constructor() {
        this.url = undefined;
        this.document = undefined;
    }

    /**
     * Parse an Atom 0.3 content construct.
     *
     * A missing type attribute is treated as "text/plain" and a missing
     * mode attribute as "xml".
     *
     * @param {Element} containerElement - Element holding the content.
     * @param {boolean} [textOnly=true] - Return plain text instead of
     *     markup.
     * @returns {string|undefined} Text or markup, undefined for unsupported
     *     type/mode combinations.
     * @throws {TypeError} If base64 content cannot be decoded.
     */
    parseAtom03ContentConstruct(containerElement, textOnly = true) {
        let contentType = containerElement.getAttribute('type');
        let contentMode = containerElement.getAttribute('mode');
        if (contentType === null) {
            contentType = 'text/plain';
        }
        if (contentMode === null) {
            contentMode = 'xml';
        }

        if (contentType === 'application/xhtml+xml') {
            if (contentMode === 'xml') {
                return textOnly ? containerElement.textContent.trim() :
                    containerElement.innerHTML;
            }

            let htmlText = decodeAtom03Text(containerElement, contentMode);
            if (typeof htmlText === 'undefined') {
                return;
            }

            if (textOnly) {
                let htmlDocument = new DOMParser().parseFromString(htmlText,
                    'application/xhtml+xml');
                if (htmlDocument.documentElement.namespaceURI ===
                    XMLNS.PARSERERROR) {
                    return;
                }

                return htmlDocument.body.textContent.trim();
            }

            return htmlText;
        } else if (contentType === 'text/html') {
            let htmlText = decodeAtom03Text(containerElement, contentMode);
            if (typeof htmlText === 'undefined') {
                return;
            }

            if (textOnly) {
                let htmlDocument = new DOMParser().parseFromString(htmlText,
                    'text/html');

                return htmlDocument.body.textContent.trim();
            }

            return htmlText;
        } else if (contentType === 'text/plain') {
            // inline ("xml" mode) plain text is the common case for
            // elements that carry neither a type nor a mode attribute
            let text = (contentMode === 'xml') ?
                containerElement.textContent.trim() :
                decodeAtom03Text(containerElement, contentMode);
            if (typeof text === 'undefined') {
                return;
            }

            return textOnly ? text : `<pre>${encodeXML(text)}</pre>`;
        }

        // unsupported content type
        return;
    }

    /**
     * Parse an Atom 0.3 content element, selecting the preferred
     * alternative of multipart content.
     *
     * @param {Element} contentElement - The content element.
     * @returns {string|undefined} Markup of the content.
     * @throws {TypeError} If multipart content has invalid child elements
     *     or base64 content cannot be decoded.
     */
    parseAtom03Content(contentElement) {
        // ordered from lowest to highest preference
        const contentTypes = [
            'text/plain',
            'text/html',
            'application/xhtml+xml'
        ];

        if (contentElement.getAttribute('type') === 'multipart/alternative' &&
            contentElement.getAttribute('mode') === null) {
            // select alternative according to above preference
            let selectedTypeIndex = -1;
            let selectedElement;
            for (let innerContentElement of contentElement.children) {
                if (innerContentElement.localName !== 'content' ||
                    innerContentElement.namespaceURI !== XMLNS.ATOM03) {
                    throw new TypeError('child elements of a multipart ' +
                        'content element must be content elements');
                }

                let innerContentType =
                    innerContentElement.getAttribute('type');
                if (innerContentType === null) {
                    innerContentType = 'text/plain';
                }
                let typeIndex = contentTypes.indexOf(innerContentType);
                if (typeIndex > selectedTypeIndex) {
                    selectedTypeIndex = typeIndex;
                    selectedElement = innerContentElement;
                }
            }
            if (selectedTypeIndex >= 0) {
                contentElement = selectedElement;
            }
        }

        return this.parseAtom03ContentConstruct(contentElement, false);
    }

    /**
     * Parse an Atom 0.3 entry element.
     *
     * @param {Element} entryElement - The entry element.
     * @returns {FeedEntry} The parsed entry.
     */
    parseAtom03Entry(entryElement) {
        let title;
        let link;
        let date;
        let content;

        let titleElement = feedQueryXPath(this.document, entryElement,
            './atom03:title');
        if (titleElement !== null) {
            title = titleElement.textContent.trim();
        }

        let linkElement = feedQueryXPath(this.document, entryElement,
            './atom03:link[@href][@rel="alternate"]');
        if (linkElement !== null) {
            link = parseURL(linkElement.getAttribute('href'), this.url);
        }

        let modifiedElement = feedQueryXPath(this.document, entryElement,
            './atom03:modified');
        if (modifiedElement !== null) {
            date = parseDate(modifiedElement.textContent);
        }

        let contentElement = feedQueryXPath(this.document, entryElement,
            './atom03:content');
        if (contentElement !== null) {
            try {
                content = this.parseAtom03Content(contentElement);
            } catch (e) {
                if (!(e instanceof TypeError)) {
                    throw e;
                }
            }
        }

        // fall back to the summary if there is no usable content
        if (typeof content === 'undefined') {
            let summaryElement = feedQueryXPath(this.document, entryElement,
                './atom03:summary');
            if (summaryElement !== null) {
                try {
                    content = this.parseAtom03ContentConstruct(summaryElement,
                        false);
                } catch (e) {
                    if (!(e instanceof TypeError)) {
                        throw e;
                    }
                }
            }
        }

        return new FeedEntry({title, link, date, content});
    }

    /**
     * Parse the current document as an Atom 0.3 feed.
     *
     * @returns {Feed} The parsed feed.
     */
    parseAtom03Feed() {
        let title;
        let subtitle;
        let logo;
        let entries = [];

        let documentElement = this.document.documentElement;

        let titleElement = feedQueryXPath(this.document, documentElement,
            './atom03:title');
        if (titleElement !== null) {
            try {
                title = this.parseAtom03ContentConstruct(titleElement);
            } catch (e) {
                if (!(e instanceof TypeError)) {
                    throw e;
                }
            }
        }

        let taglineElement = feedQueryXPath(this.document, documentElement,
            './atom03:tagline');
        if (taglineElement !== null) {
            try {
                // the tagline is the subtitle of the feed, not its title
                subtitle = this.parseAtom03ContentConstruct(taglineElement);
            } catch (e) {
                if (!(e instanceof TypeError)) {
                    throw e;
                }
            }
        }

        let logoElement = feedQueryXPath(this.document, documentElement,
            './atom03:logo');
        if (logoElement !== null) {
            try {
                logo = this.parseAtomLogo(logoElement);
            } catch (e) {
                if (!(e instanceof TypeError)) {
                    throw e;
                }
            }
        }

        let entryElements = feedQueryXPathAll(this.document, documentElement,
            './atom03:entry');
        for (let entryElement of entryElements) {
            entries.push(this.parseAtom03Entry(entryElement));
        }

        return new Feed(this.url, {title, subtitle, logo, entries});
    }

    /**
     * Parse an Atom logo element.
     *
     * @param {Element} logoElement - The logo element.
     * @returns {FeedLogo} The logo.
     * @throws {TypeError} If the element does not contain a valid URL.
     */
    parseAtomLogo(logoElement) {
        let url = parseURL(logoElement.textContent.trim(), this.url);
        if (url === null) {
            throw new TypeError('invalid URL in <logo> element');
        }

        return new FeedLogo(url);
    }

    /**
     * Parse an Atom 1.0 text construct.
     *
     * @param {Element} containerElement - Element holding the text.
     * @param {boolean} [textOnly=true] - Return plain text instead of
     *     markup.
     * @returns {string|undefined} Text or markup, undefined for unsupported
     *     content types or malformed XHTML content.
     */
    parseAtomTextConstruct(containerElement, textOnly = true) {
        let contentType = containerElement.getAttribute('type');
        if (contentType === null) {
            contentType = 'text';
        }

        if (contentType === 'xhtml') {
            // XHTML content must be wrapped in a single XHTML div element
            let xhtmlRootElement = containerElement.firstElementChild;
            if (xhtmlRootElement !== null &&
                xhtmlRootElement.localName === 'div' &&
                xhtmlRootElement.namespaceURI === XMLNS.XHTML) {
                return textOnly ? xhtmlRootElement.textContent.trim() :
                    xhtmlRootElement.innerHTML;
            }
        } else if (contentType === 'html') {
            let htmlText = containerElement.textContent;
            if (textOnly) {
                let htmlDocument = new DOMParser().parseFromString(htmlText,
                    'text/html');

                return htmlDocument.body.textContent.trim();
            }

            return htmlText;
        } else if (contentType === 'text') {
            let text = containerElement.textContent.trim();

            return textOnly ? text : `<pre>${encodeXML(text)}</pre>`;
        }

        // unsupported content type
        return;
    }

    /**
     * Parse an Atom 1.0 content element.
     *
     * @param {Element} contentElement - The content element.
     * @returns {string|undefined} Markup of the content.
     */
    parseAtomContent(contentElement) {
        let contentSrc = contentElement.getAttribute('src');
        if (contentSrc !== null) {
            // externally referenced content is not supported
            return;
        }

        return this.parseAtomTextConstruct(contentElement, false);
    }

    /**
     * Parse an Atom 1.0 enclosure link.
     *
     * @param {Element} enclosureElement - Link element with
     *     rel="enclosure".
     * @returns {FeedEntryFile} The attached file.
     * @throws {TypeError} If the href attribute is not a valid URL.
     */
    parseAtomEntryFile(enclosureElement) {
        let type;
        let size;

        let url = parseURL(enclosureElement.getAttribute('href'), this.url);
        if (url === null) {
            throw new TypeError('invalid URL in enclosure href attribute');
        }

        let typeAttribute = enclosureElement.getAttribute('type');
        if (typeAttribute !== null) {
            type = typeAttribute;
        }

        let length = Number.parseInt(
            enclosureElement.getAttribute('length'), 10);
        if (!Number.isNaN(length)) {
            size = length;
        }

        return new FeedEntryFile(url, {type, size});
    }

    /**
     * Parse an Atom 1.0 entry element.
     *
     * @param {Element} entryElement - The entry element.
     * @returns {FeedEntry} The parsed entry.
     */
    parseAtomEntry(entryElement) {
        let title;
        let link;
        let date;
        let content;
        let files = [];

        let titleElement = feedQueryXPath(this.document, entryElement,
            './atom:title');
        if (titleElement !== null) {
            title = this.parseAtomTextConstruct(titleElement);
        }

        let linkElement = feedQueryXPath(this.document, entryElement,
            './atom:link[@href][not(@rel) or @rel="alternate"]');
        if (linkElement !== null) {
            link = parseURL(linkElement.getAttribute('href'), this.url);
        }

        let updatedElement = feedQueryXPath(this.document, entryElement,
            './atom:updated');
        if (updatedElement !== null) {
            date = parseDate(updatedElement.textContent);
        }

        let contentElement = feedQueryXPath(this.document, entryElement,
            './atom:content');
        if (contentElement !== null) {
            content = this.parseAtomContent(contentElement);
        }

        // fall back to the summary if there is no usable content
        if (typeof content === 'undefined') {
            let summaryElement = feedQueryXPath(this.document, entryElement,
                './atom:summary');
            if (summaryElement !== null) {
                content = this.parseAtomTextConstruct(summaryElement, false);
            }
        }

        for (let enclosureElement of feedQueryXPathAll(this.document,
            entryElement, './atom:link[@href][@rel="enclosure"]')) {
            try {
                let entryFile = this.parseAtomEntryFile(enclosureElement);
                files.push(entryFile);
            } catch (e) {
                if (!(e instanceof TypeError)) {
                    throw e;
                }
            }
        }

        return new FeedEntry({title, link, date, content, files});
    }

    /**
     * Parse the current document as an Atom 1.0 feed.
     *
     * @returns {Feed} The parsed feed.
     */
    parseAtomFeed() {
        let title;
        let subtitle;
        let logo;
        let entries = [];

        let documentElement = this.document.documentElement;

        let titleElement = feedQueryXPath(this.document, documentElement,
            './atom:title');
        if (titleElement !== null) {
            title = this.parseAtomTextConstruct(titleElement);
        }

        let subtitleElement = feedQueryXPath(this.document, documentElement,
            './atom:subtitle');
        if (subtitleElement !== null) {
            subtitle = this.parseAtomTextConstruct(subtitleElement);
        }

        let logoElement = feedQueryXPath(this.document, documentElement,
            './atom:logo');
        if (logoElement !== null) {
            try {
                logo = this.parseAtomLogo(logoElement);
            } catch (e) {
                if (!(e instanceof TypeError)) {
                    throw e;
                }
            }
        }

        let entryElements = feedQueryXPathAll(this.document, documentElement,
            './atom:entry');
        for (let entryElement of entryElements) {
            entries.push(this.parseAtomEntry(entryElement));
        }

        return new Feed(this.url, {title, subtitle, logo, entries});
    }

    /**
     * Parse an RSS 0.9 image element.
     *
     * @param {Element} imageElement - The image element.
     * @returns {FeedLogo} The logo.
     * @throws {TypeError} If the URL is missing or invalid.
     */
    parseRSS1Logo(imageElement) {
        let title;

        let urlElement = feedQueryXPath(this.document, imageElement,
            './rss:url');
        if (urlElement === null) {
            throw new TypeError('missing <url> element in <image> element');
        }
        let url = parseURL(urlElement.textContent.trim(), this.url);
        if (url === null) {
            throw new TypeError('invalid URL in <image> element');
        }

        let titleElement = feedQueryXPath(this.document, imageElement,
            './rss:title');
        if (titleElement !== null) {
            title = titleElement.textContent.trim();
        }

        return new FeedLogo(url, {title});
    }

    /**
     * Parse an RSS 0.9 item element.
     *
     * @param {Element} itemElement - The item element.
     * @returns {FeedEntry} The parsed entry.
     */
    parseRSS1Entry(itemElement) {
        let title;
        let link;

        let titleElement = feedQueryXPath(this.document, itemElement,
            './rss:title');
        if (titleElement !== null) {
            title = titleElement.textContent;
        }

        let linkElement = feedQueryXPath(this.document, itemElement,
            './rss:link');
        if (linkElement !== null) {
            link = parseURL(linkElement.textContent, this.url);
        }

        return new FeedEntry({title, link});
    }

    /**
     * Parse the current document as an RSS 0.9 feed.
     *
     * @returns {Feed} The parsed feed.
     */
    parseRSS1Feed() {
        let title;
        let subtitle;
        let logo;
        let entries = [];

        let documentElement = this.document.documentElement;

        let titleElement = feedQueryXPath(this.document, documentElement,
            './rss:channel/rss:title');
        if (titleElement !== null) {
            title = titleElement.textContent;
        }

        let descriptionElement = feedQueryXPath(this.document,
            documentElement, './rss:channel/rss:description');
        if (descriptionElement !== null) {
            subtitle = descriptionElement.textContent;
        }

        // image and item elements are queried as children of the document
        // element, not of the channel
        let imageElement = feedQueryXPath(this.document, documentElement,
            './rss:image');
        if (imageElement !== null) {
            try {
                logo = this.parseRSS1Logo(imageElement);
            } catch (e) {
                if (!(e instanceof TypeError)) {
                    throw e;
                }
            }
        }

        let itemElements = feedQueryXPathAll(this.document, documentElement,
            './rss:item');
        for (let itemElement of itemElements) {
            entries.push(this.parseRSS1Entry(itemElement));
        }

        return new Feed(this.url, {title, subtitle, logo, entries});
    }

    /**
     * Parse an RSS 0.9x/2.0 image element.
     *
     * @param {Element} imageElement - The image element.
     * @returns {FeedLogo} The logo.
     * @throws {TypeError} If the URL is missing or invalid.
     */
    parseRSS2Logo(imageElement) {
        let title;

        let urlElement = feedQueryXPath(this.document, imageElement,
            './url');
        if (urlElement === null) {
            throw new TypeError('missing <url> element in <image> element');
        }
        let url = parseURL(urlElement.textContent.trim(), this.url);
        if (url === null) {
            throw new TypeError('invalid URL in <image> element');
        }

        let titleElement = feedQueryXPath(this.document, imageElement,
            './title');
        if (titleElement !== null) {
            title = titleElement.textContent.trim();
        }

        return new FeedLogo(url, {title});
    }

    /**
     * Parse an RSS 0.9x/2.0 enclosure element.
     *
     * @param {Element} enclosureElement - The enclosure element.
     * @returns {FeedEntryFile} The attached file.
     * @throws {TypeError} If the url attribute is missing or invalid.
     */
    parseRSS2EntryFile(enclosureElement) {
        let type;
        let size;

        let url = parseURL(enclosureElement.getAttribute('url'), this.url);
        if (url === null) {
            throw new TypeError('invalid URL in <enclosure> element');
        }

        let typeAttribute = enclosureElement.getAttribute('type');
        if (typeAttribute !== null) {
            type = typeAttribute;
        }

        let length = Number.parseInt(
            enclosureElement.getAttribute('length'), 10);
        if (!Number.isNaN(length)) {
            size = length;
        }

        return new FeedEntryFile(url, {type, size});
    }

    /**
     * Parse an RSS 0.9x/2.0 item element.
     *
     * @param {Element} itemElement - The item element.
     * @returns {FeedEntry} The parsed entry.
     */
    parseRSS2Entry(itemElement) {
        let title;
        let link;
        let date;
        let content;
        let files = [];

        let titleElement = feedQueryXPath(this.document, itemElement,
            './title');
        if (titleElement !== null) {
            title = titleElement.textContent;
        }

        let linkElement = feedQueryXPath(this.document, itemElement,
            './link');
        if (linkElement !== null) {
            link = parseURL(linkElement.textContent, this.url);
        }

        let pubDateElement = feedQueryXPath(this.document, itemElement,
            './pubDate');
        if (pubDateElement !== null) {
            date = parseDate(pubDateElement.textContent);
        }

        let descriptionElement = feedQueryXPath(this.document, itemElement,
            './description');
        if (descriptionElement !== null) {
            content = descriptionElement.textContent.trim();
        }

        for (let enclosureElement of feedQueryXPathAll(this.document,
            itemElement, './enclosure')) {
            try {
                let entryFile = this.parseRSS2EntryFile(enclosureElement);
                files.push(entryFile);
            } catch (e) {
                if (!(e instanceof TypeError)) {
                    throw e;
                }
            }
        }

        return new FeedEntry({title, link, date, content, files});
    }

    /**
     * Parse the current document as an RSS 0.9x/2.0 feed.
     *
     * @returns {Feed} The parsed feed.
     */
    parseRSS2Feed() {
        let title;
        let subtitle;
        let logo;
        let entries = [];

        let documentElement = this.document.documentElement;

        let titleElement = feedQueryXPath(this.document, documentElement,
            './channel/title');
        if (titleElement !== null) {
            title = titleElement.textContent;
        }

        let descriptionElement = feedQueryXPath(this.document,
            documentElement, './channel/description');
        if (descriptionElement !== null) {
            subtitle = descriptionElement.textContent;
        }

        let imageElement = feedQueryXPath(this.document, documentElement,
            './channel/image');
        if (imageElement !== null) {
            try {
                logo = this.parseRSS2Logo(imageElement);
            } catch (e) {
                if (!(e instanceof TypeError)) {
                    throw e;
                }
            }
        }

        let itemElements = feedQueryXPathAll(this.document, documentElement,
            './channel/item');
        for (let itemElement of itemElements) {
            entries.push(this.parseRSS2Entry(itemElement));
        }

        return new Feed(this.url, {title, subtitle, logo, entries});
    }

    /**
     * Parse a feed from its XML source.
     *
     * @param {string} xmlString - XML source of the feed.
     * @param {string|URL} url - URL of the feed, used as base for relative
     *     links.
     * @returns {Feed} The parsed feed.
     * @throws {ParserError} If the source is not well-formed XML.
     * @throws {UnsupportedFeedTypeError} If the document is not a supported
     *     feed.
     */
    parseFromString(xmlString, url) {
        this.url = url;
        this.document = new DOMParser().parseFromString(xmlString,
            'application/xml');
        if (this.document.documentElement.namespaceURI ===
            XMLNS.PARSERERROR) {
            throw new ParserError(this.document.documentElement.textContent);
        }

        let [type, version] = this.constructor.probeFeed(this.document);
        if (type === 'atom') {
            if (version === '0.3') {
                return this.parseAtom03Feed();
            } else if (version === '1.0') {
                return this.parseAtomFeed();
            }
        } else if (type === 'rss') {
            if (version === '0.9') {
                return this.parseRSS1Feed();
            } else {
                return this.parseRSS2Feed();
            }
        }

        throw new UnsupportedFeedTypeError();
    }
}