diff js/feed-parser.js @ 6:5d7c13e998e9

Create feed previews using a stream filter. Instead of replacing the feed document with an XHTML preview from a content script after it has already been rendered, create an XHTML preview using a stream filter before it is passed into the rendering engine and use an XSL style sheet to convert it to HTML. This has two advantages: firstly, it results in an HTMLDocument with the full HTML DOM available, and secondly, it avoids rendering the document twice. Refactor the feed preview creation and split parsing and rendering into separate modules.
author Guido Berhoerster <guido+feed-preview@berhoerster.name>
date Thu, 08 Nov 2018 16:30:34 +0100
parents
children 2bbb7617dd13
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/js/feed-parser.js	Thu Nov 08 16:30:34 2018 +0100
@@ -0,0 +1,538 @@
+/*
+ * Copyright (C) 2018 Guido Berhoerster <guido+feed-preview@berhoerster.name>
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+'use strict';
+
+export const XMLNS = {
+    ATOM10: 'http://www.w3.org/2005/Atom',
+    RSS09: 'http://my.netscape.com/rdf/simple/0.9/'
+}
+const ALLOWED_LINK_PROTOCOLS = new Set(['http:', 'https:', 'ftp:']);
+
+function encodeXML(str) {
+    return str.replace(/[<>&'"]/g, c => {
+        switch (c) {
+            case '<': return '&lt;';
+            case '>': return '&gt;';
+            case '&': return '&amp;';
+            case '\'': return '&apos;';
+            case '"': return '&quot;';
+        }
+    });
+}
+
+function parseDate(s) {
+    let date = new Date(s);
+
+    return isNaN(date) ? new Date(0) : date;
+}
+
+function parseURL(text, baseURL = '') {
+    let url;
+
+    try {
+        url = new URL(text, baseURL);
+    } catch (e) {
+        return null;
+    }
+    if (!ALLOWED_LINK_PROTOCOLS.has(url.protocol)) {
+        return null;
+    }
+
+    return url;
+}
+
+function feedNSResolver(prefix) {
+    switch (prefix) {
+        case 'atom':
+            return XMLNS.ATOM10;
+        case 'rss':
+            return XMLNS.RSS09;
+    }
+    return null;
+}
+
+function feedQueryXPath(feedDocument, scopeElement, xpathQuery) {
+    return feedDocument.evaluate(xpathQuery, scopeElement, feedNSResolver,
+            XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
+}
+
+function feedQueryXPathAll(feedDocument, scopeElement, xpathQuery) {
+    let result = feedDocument.evaluate(xpathQuery, scopeElement, feedNSResolver,
+            XPathResult.ORDERED_NODE_ITERATOR_TYPE, null);
+    let nodes = [];
+    for (let node = result.iterateNext(); node !== null;
+            node = result.iterateNext()) {
+        nodes.push(node);
+    }
+
+    return nodes;
+}
+
+/**
+ * Error thrown by FeedParser.parseFromString() when the input is not
+ * well-formed XML.
+ */
+export class ParserError extends Error {
+    constructor(...params) {
+        super(...params);
+        // report the name of the concrete class instead of "Error"
+        this.name = this.constructor.name;
+    }
+}
+
+/**
+ * Error thrown by FeedParser.parseFromString() when the document is
+ * well-formed XML but not one of the supported feed formats.
+ */
+export class UnsupportedFeedTypeError extends Error {
+    constructor(message = 'Document is not a supported feed', ...rest) {
+        super(message, ...rest);
+        // report the name of the concrete class instead of "Error"
+        this.name = this.constructor.name;
+    }
+}
+
+/**
+ * Error describing a failed transfer; exposes the URL, the status and the
+ * status text it was constructed with as properties.
+ */
+export class ProtocolError extends Error {
+    constructor(url, status, statusText, ...params) {
+        super(`Protocol error: Transfer of ${url} failed with: ${status} ` +
+                `${statusText}`, ...params);
+        // report the name of the concrete class instead of "Error"
+        this.name = this.constructor.name;
+        Object.assign(this, {url, status, statusText});
+    }
+}
+
+class FeedLogo {
+    constructor(url, {title = ''} = {}) {
+        this.url = url;
+        this.title = title;
+    }
+}
+
+class FeedEntryFile {
+    constructor(url, {type = browser.i18n.getMessage('defaultFileType'),
+            size = 0} = {}) {
+        this.filename = undefined;
+        this._url = undefined;
+        this.url = url;
+        this.type = type;
+        this.size = size;
+    }
+
+    set url(url) {
+        this._url = url;
+        let filename = url.pathname.split('/').pop();
+        this.filename = filename !== '' ? filename :
+                browser.i18n.getMessage('defaultFileName');
+    }
+
+    get url() {
+        return this._url;
+    }
+}
+
+class FeedEntry {
+    constructor({title = browser.i18n.getMessage('defaultFeedEntryTitle'),
+            link = undefined, date = new Date(0), content = '',
+            files = []} = {}) {
+        this.title = title;
+        this.link = link;
+        this.date = date;
+        this._content = undefined;
+        this.content = content;
+        this.files = files;
+    }
+
+    normalizeContent(text) {
+        if (typeof text === 'undefined') {
+            return
+        }
+
+        let contentDocument = document.implementation.createHTMLDocument();
+        let parsedDocument = new DOMParser().parseFromString(text, 'text/html');
+        contentDocument.body = contentDocument.adoptNode(parsedDocument.body);
+        return new XMLSerializer().serializeToString(contentDocument);
+    }
+
+    set content(content) {
+        this._content = this.normalizeContent(content);
+    }
+
+    get content() {
+        return this._content;
+    }
+}
+
+class Feed {
+    constructor(url, {title = browser.i18n.getMessage('defaultFeedTitle'),
+            subtitle = '', logo, entries = []} = {}) {
+        this.url = url;
+        this.title = title;
+        this.subtitle = subtitle;
+        this.logo = logo;
+        this.entries = entries;
+    }
+}
+
+export class FeedParser {
+    static probeFeed(feedDocument) {
+        let documentElement = feedDocument.documentElement;
+        if (documentElement.nodeName === 'feed' &&
+                documentElement.namespaceURI === XMLNS.ATOM10) {
+            let version = documentElement.getAttribute('version');
+            if (version === null) {
+                version = '1.0';
+            }
+            if (version === '1.0') {
+                return ['atom', version];
+            }
+        } else if (documentElement.nodeName === 'rss') {
+            let version = documentElement.getAttribute('version');
+            switch (version) {
+                case '0.90':
+                case '0.91':
+                case '0.92':
+                case '0.93':
+                case '0.94':
+                case '2.0':
+                    return ['rss', version];
+            }
+        } else if (documentElement.localName.toLowerCase() === 'rdf' &&
+                documentElement.getAttribute('xmlns') === XMLNS.RSS09) {
+            return ['rss', '0.9'];
+        }
+
+        return [undefined, undefined];
+    }
+
+    constructor() {
+        this.url = undefined;
+        this.document = undefined;
+    }
+
+    parseAtomLogo(logoElement) {
+        let url = parseURL(logoElement.textContent.trim(), this.url);
+        if (url === null) {
+            throw new TypeError('invalid URL in <logo> element');
+        }
+        return new FeedLogo(url);
+    }
+
+    parseAtomEntry(entryElement) {
+        let title;
+        let link;
+        let date;
+        let content;
+        let titleElement = feedQueryXPath(this.document, entryElement,
+                './atom:title');
+        if (titleElement !== null) {
+            title = titleElement.textContent.trim();
+        }
+
+        let linkElement = feedQueryXPath(this.document, entryElement,
+                './atom:link[@href][@rel="alternate"]');
+        if (linkElement !== null) {
+            link = parseURL(linkElement.getAttribute('href'), this.url);
+        }
+
+        let updatedElement = feedQueryXPath(this.document, entryElement,
+                './atom:updated');
+        if (updatedElement !== null) {
+            date = parseDate(updatedElement.textContent);
+        }
+
+        let contentElement = feedQueryXPath(this.document, entryElement,
+                './atom:content');
+        if (contentElement === null) {
+            contentElement = feedQueryXPath(this.document, entryElement,
+                    './atom:summary');
+        }
+        if (contentElement !== null) {
+            let contentType = contentElement.getAttribute('type');
+            if (contentType === null) {
+                contentType = 'text';
+            }
+            contentType = contentType.toLowerCase();
+            if (contentType === 'xhtml') {
+                content = contentElement.innerHTML;
+            } else if (contentType === 'html') {
+                content = contentElement.textContent;
+            } else {
+                let encodedContent =
+                        encodeXML(contentElement.textContent.trim());
+                content = `<pre>${encodedContent}</pre>`;
+            }
+        }
+
+        return new FeedEntry({title, link, date, content});
+    }
+
+    parseAtomFeed() {
+        let title;
+        let subtitle;
+        let logo;
+        let entries = [];
+        let documentElement = this.document.documentElement;
+
+        let titleElement = feedQueryXPath(this.document, documentElement,
+                './atom:title');
+        if (titleElement !== null) {
+            title = titleElement.textContent.trim();
+        }
+
+        let subtitleElement = feedQueryXPath(this.document, documentElement,
+                './atom:subtitle');
+        if (subtitleElement !== null) {
+            subtitle = subtitleElement.textContent.trim();
+        }
+
+        let logoElement = feedQueryXPath(this.document, documentElement,
+                './atom:logo');
+        if (logoElement !== null) {
+            try {
+                logo = this.parseAtomLogo(logoElement);
+            } catch (e) {
+                if (!(e instanceof TypeError)) {
+                    throw e;
+                }
+            }
+        }
+
+        let entryElements = feedQueryXPathAll(this.document, documentElement,
+                './atom:entry');
+        for (let entryElement of entryElements) {
+            entries.push(this.parseAtomEntry(entryElement));
+        }
+
+        return new Feed(this.url, {title, subtitle, logo, entries});
+    }
+
+    parseRSS1Logo(imageElement) {
+        let title;
+        let urlElement = feedQueryXPath(this.document, imageElement,
+                './rss:url');
+        if (urlElement === null) {
+            throw new TypeError('missing <url> element in <logo> element');
+        }
+        let url = parseURL(urlElement.textContent.trim(), this.url);
+        if (url === null) {
+            throw new TypeError('invalid URL in <logo> element');
+        }
+
+        let titleElement = feedQueryXPath(this.document, imageElement,
+                './rss:title');
+        if (titleElement !== null) {
+            title = titleElement.textContent.trim();
+        }
+
+        return new FeedLogo(url, {title});
+    }
+
+    parseRSS1Entry(itemElement) {
+        let title;
+        let link;
+        let titleElement = feedQueryXPath(this.document, itemElement,
+                './rss:title');
+        if (titleElement !== null) {
+            title = titleElement.textContent;
+        }
+
+        let linkElement = feedQueryXPath(this.document, itemElement,
+                './rss:link');
+        if (linkElement !== null) {
+            link = parseURL(linkElement.textContent, this.url);
+        }
+
+        return new FeedEntry({title, link});
+    }
+
+    parseRSS1Feed() {
+        let title;
+        let subtitle;
+        let logo;
+        let entries = [];
+        let documentElement = this.document.documentElement;
+        let titleElement = feedQueryXPath(this.document, documentElement,
+                './rss:channel/rss:title');
+        if (titleElement !== null) {
+            title = titleElement.textContent;
+        }
+
+        let descriptionElement = feedQueryXPath(this.document, documentElement,
+                './channel/description');
+        if (descriptionElement !== null) {
+            subtitle = descriptionElement.textContent;
+        }
+
+        let imageElement = feedQueryXPath(this.document, documentElement,
+                './rss:image');
+        if (imageElement !== null) {
+            try {
+                logo = this.parseRSS1Logo(imageElement);
+            } catch (e) {
+                if (!(e instanceof TypeError)) {
+                    throw e;
+                }
+            }
+        }
+
+        let itemElements = feedQueryXPathAll(this.document, documentElement,
+                './rss:item');
+        for (let itemElement of itemElements) {
+            let entry = this.parseRSS1Entry(itemElement);
+            if (typeof entry !== 'undefined') {
+                entries.push(entry);
+            }
+        }
+
+        return new Feed(this.url, {title, subtitle, logo, entries});
+    }
+
+    parseRSS2Logo(imageElement) {
+        let title;
+        let urlElement = feedQueryXPath(this.document, imageElement, './url');
+        if (urlElement === null) {
+            throw new TypeError('missing <url> element in <logo> element');
+        }
+        let url = parseURL(urlElement.textContent.trim(), this.url);
+        if (url === null) {
+            throw new TypeError('invalid URL in <logo> element');
+        }
+
+        let titleElement = feedQueryXPath(this.document, imageElement,
+                './title');
+        if (titleElement !== null) {
+            title = titleElement.textContent.trim();
+        }
+
+        return new FeedLogo(url, {title});
+    }
+
+    parseRSS2EntryFile(enclosureElement) {
+        let type;
+        let size;
+        let url = parseURL(enclosureElement.getAttribute('url'), this.url);
+        if (url === null) {
+            throw new TypeError('invalid URL in <enclosure> element');
+        }
+
+        let typeAttribute = enclosureElement.getAttribute('type');
+        if (typeAttribute !== null) {
+            type = typeAttribute;
+        }
+
+        let length = parseInt(enclosureElement.getAttribute('length'),
+                10);
+        if (!isNaN(length)) {
+            size = length;
+        }
+
+        return new FeedEntryFile(url, {type, size});
+    }
+
+    parseRSS2Entry(itemElement) {
+        let title;
+        let link;
+        let date;
+        let content;
+        let files = [];
+        let titleElement = feedQueryXPath(this.document, itemElement,
+                './title');
+        if (titleElement !== null) {
+            title = titleElement.textContent;
+        }
+
+        let linkElement = feedQueryXPath(this.document, itemElement, './link');
+        if (linkElement !== null) {
+            link = parseURL(linkElement.textContent, this.url);
+        }
+
+        let pubDateElement = feedQueryXPath(this.document, itemElement,
+                './pubDate');
+        if (pubDateElement !== null) {
+            date = parseDate(pubDateElement.textContent);
+        }
+
+        let descriptionElement = feedQueryXPath(this.document, itemElement,
+                './description');
+        if (descriptionElement !== null) {
+            content = descriptionElement.textContent.trim();
+        }
+
+        for (let enclosureElement of
+                feedQueryXPathAll(this.document, itemElement, './enclosure')) {
+            try {
+                let entryFile = this.parseRSS2EntryFile(enclosureElement);
+                files.push(entryFile);
+            } catch (e) {
+                if (!(e instanceof TypeError)) {
+                    throw e;
+                }
+            }
+        }
+
+        return new FeedEntry({title, link, date, content, files});
+    }
+
+    parseRSS2Feed() {
+        let title;
+        let subtitle;
+        let logo;
+        let entries = [];
+        let documentElement = this.document.documentElement;
+        let titleElement = feedQueryXPath(this.document, documentElement,
+                './channel/title');
+        if (titleElement !== null) {
+            title = titleElement.textContent;
+        }
+
+        let descriptionElement = feedQueryXPath(this.document, documentElement,
+                './channel/description');
+        if (descriptionElement !== null) {
+            subtitle = descriptionElement.textContent;
+        }
+
+        let imageElement = feedQueryXPath(this.document, documentElement,
+                './channel/image');
+        if (imageElement !== null) {
+            try {
+                logo = this.parseRSS2Logo(imageElement);
+            } catch (e) {
+                if (!(e instanceof TypeError)) {
+                    throw e;
+                }
+            }
+        }
+
+        let itemElements = feedQueryXPathAll(this.document, documentElement,
+                './channel/item');
+        for (let itemElement of itemElements) {
+            let entry = this.parseRSS2Entry(itemElement);
+            if (typeof entry !== 'undefined') {
+                entries.push(entry);
+            }
+        }
+
+        return new Feed(this.url, {title, subtitle, logo, entries});
+    }
+
+    parseFromString(xmlString, url) {
+        this.url = url;
+        this.document = new DOMParser().parseFromString(xmlString,
+                'application/xml');
+        if (this.document.documentElement.nodeName.toLowerCase() ===
+                'parsererror') {
+            throw new ParserError(this.document.documentElement.textContent);
+        }
+
+        let [type, version] = this.constructor.probeFeed(this.document);
+        if (type === 'atom') {
+            return this.parseAtomFeed();
+        } else if (type === 'rss') {
+            if (version === '0.9') {
+                return this.parseRSS1Feed();
+            } else {
+                return this.parseRSS2Feed();
+            }
+        }
+        throw new UnsupportedFeedTypeError();
+    }
+}