view js/feed-parser.js @ 14:376a0e415bba

Properly handle non-text content in Atom feed elements. The title, subtitle, summary and content elements of Atom feeds can all have non-text content. When parsing title and subtitle elements, HTML and XHTML content will be stripped of any markup in order to keep it simple. In summary and content elements, markup will be preserved. Element content of any other type, as well as remote content in content elements, will be ignored.
author Guido Berhoerster <guido+feed-preview@berhoerster.name>
date Mon, 10 Dec 2018 16:38:11 +0100
parents 2bbb7617dd13
children 150f07c7595f
line wrap: on
line source

/*
 * Copyright (C) 2018 Guido Berhoerster <guido+feed-preview@berhoerster.name>
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */

'use strict';

/* XML namespace URIs referenced while probing and parsing feeds. */
export const XMLNS = {
    ATOM10: 'http://www.w3.org/2005/Atom',
    RSS09: 'http://my.netscape.com/rdf/simple/0.9/',
    XHTML: 'http://www.w3.org/1999/xhtml',
};

/* URL protocols (including the trailing colon) accepted by parseURL(). */
const ALLOWED_LINK_PROTOCOLS = new Set([
    'http:',
    'https:',
    'ftp:',
]);

/**
 * Replace the characters <, >, &, ' and " in a string with the corresponding
 * predefined XML entity references.
 *
 * @param {string} str - text to escape
 * @returns {string} the escaped text
 */
function encodeXML(str) {
    const entityReferences = new Map([
        ['<', '&lt;'],
        ['>', '&gt;'],
        ['&', '&amp;'],
        ['\'', '&apos;'],
        ['"', '&quot;'],
    ]);

    return str.replace(/[<>&'"]/g, match => entityReferences.get(match));
}

/**
 * Convert a date string into a Date object.
 *
 * @param {string} s - date string as understood by the Date constructor
 * @returns {Date} the parsed date, or the epoch (time value 0) if the string
 * does not yield a valid date
 */
function parseDate(s) {
    const parsed = new Date(s);
    if (Number.isNaN(parsed.getTime())) {
        return new Date(0);
    }

    return parsed;
}

/**
 * Parse a URL reference found in a feed and restrict it to the protocols
 * listed in ALLOWED_LINK_PROTOCOLS.
 *
 * @param {?string} text - absolute or relative URL reference
 * @param {string} [baseURL=''] - base URL used to resolve a relative
 * reference; when empty, only absolute URLs can be parsed
 * @returns {?URL} the parsed URL, or null if text is missing, cannot be
 * parsed, or uses a protocol that is not allowed
 */
function parseURL(text, baseURL = '') {
    // a missing attribute or element value must not be stringified into the
    // relative reference "null" or "undefined"
    if (text === null || typeof text === 'undefined') {
        return null;
    }

    let url;
    try {
        // an empty or null base is itself an invalid URL and would make the
        // constructor throw even for absolute references, so it is omitted
        url = (baseURL === '' || baseURL === null) ? new URL(text) :
                new URL(text, baseURL);
    } catch (e) {
        return null;
    }
    if (!ALLOWED_LINK_PROTOCOLS.has(url.protocol)) {
        return null;
    }

    return url;
}

/**
 * Namespace resolver for XPath queries on feed documents.
 *
 * @param {string} prefix - namespace prefix used in an XPath expression
 * @returns {?string} the Atom 1.0 namespace for "atom", the RSS 0.9
 * namespace for "rss", and null for any other prefix
 */
function feedNSResolver(prefix) {
    if (prefix === 'atom') {
        return XMLNS.ATOM10;
    }
    if (prefix === 'rss') {
        return XMLNS.RSS09;
    }

    return null;
}

/**
 * Evaluate an XPath expression and return the first matching node.
 *
 * @param {Document} feedDocument - document the expression is evaluated on
 * @param {Node} scopeElement - context node of the expression
 * @param {string} xpathQuery - XPath expression, may use the prefixes known
 * to feedNSResolver()
 * @returns {?Node} the single node value of the result
 */
function feedQueryXPath(feedDocument, scopeElement, xpathQuery) {
    const xpathResult = feedDocument.evaluate(
        xpathQuery,
        scopeElement,
        feedNSResolver,
        XPathResult.FIRST_ORDERED_NODE_TYPE,
        null
    );

    return xpathResult.singleNodeValue;
}

/**
 * Evaluate an XPath expression and collect all matching nodes.
 *
 * @param {Document} feedDocument - document the expression is evaluated on
 * @param {Node} scopeElement - context node of the expression
 * @param {string} xpathQuery - XPath expression, may use the prefixes known
 * to feedNSResolver()
 * @returns {Node[]} the nodes in the order in which the result iterator
 * yields them, empty if there is no match
 */
function feedQueryXPathAll(feedDocument, scopeElement, xpathQuery) {
    const nodeIterator = feedDocument.evaluate(
        xpathQuery,
        scopeElement,
        feedNSResolver,
        XPathResult.ORDERED_NODE_ITERATOR_TYPE,
        null
    );

    const matches = [];
    let current = nodeIterator.iterateNext();
    while (current !== null) {
        matches.push(current);
        current = nodeIterator.iterateNext();
    }

    return matches;
}

/**
 * Error raised when a document cannot be parsed. All constructor arguments
 * are forwarded to Error; the name property is set to the class name.
 */
export class ParserError extends Error {
    constructor(...params) {
        super(...params);
        this.name = this.constructor.name;
    }
}

/**
 * Error raised when a document is not a feed of a supported type.
 *
 * The message defaults to a generic description; any further arguments are
 * forwarded to Error and the name property is set to the class name.
 */
export class UnsupportedFeedTypeError extends Error {
    constructor(message = 'Document is not a supported feed', ...params) {
        super(message, ...params);

        const {name} = this.constructor;
        this.name = name;
    }
}

/**
 * Error describing a failed transfer of a URL.
 *
 * The message is built from the URL, the status and the status text, all of
 * which are also stored as properties on the error object. Any further
 * arguments are forwarded to Error.
 */
export class ProtocolError extends Error {
    constructor(url, status, statusText, ...params) {
        super(
            `Protocol error: Transfer of ${url} failed with: ${status} ${statusText}`,
            ...params
        );
        this.name = this.constructor.name;
        Object.assign(this, {url, status, statusText});
    }
}

/**
 * Logo of a feed, consisting of its URL and an optional title which
 * defaults to the empty string.
 */
class FeedLogo {
    constructor(url, {title = ''} = {}) {
        Object.assign(this, {url, title});
    }
}

/**
 * File attached to a feed entry.
 *
 * The filename property is derived from the last segment of the URL path
 * whenever the url property is assigned; if that segment is empty, a
 * localized default name is used instead. The type defaults to a localized
 * default file type and the size to 0.
 */
class FeedEntryFile {
    constructor(url, {type = browser.i18n.getMessage('defaultFileType'),
            size = 0} = {}) {
        this.filename = undefined;
        this._url = undefined;
        // goes through the setter which also updates the filename
        this.url = url;
        this.type = type;
        this.size = size;
    }

    get url() {
        return this._url;
    }

    set url(url) {
        this._url = url;

        const path = url.pathname;
        const lastSegment = path.substring(path.lastIndexOf('/') + 1);
        if (lastSegment === '') {
            this.filename = browser.i18n.getMessage('defaultFileName');
        } else {
            this.filename = lastSegment;
        }
    }
}

/**
 * Single entry of a feed.
 *
 * Content assigned to the content property is passed through
 * normalizeContent() before it is stored.
 */
class FeedEntry {
    constructor({title = browser.i18n.getMessage('defaultFeedEntryTitle'),
            link = undefined, date = new Date(0), content = '',
            files = []} = {}) {
        this.title = title;
        this.link = link;
        this.date = date;
        this._content = undefined;
        // goes through the setter which normalizes the markup
        this.content = content;
        this.files = files;
    }

    /**
     * Parse text as HTML and serialize a new HTML document holding the
     * resulting body with XMLSerializer.
     *
     * @param {string|undefined} text - HTML markup
     * @returns {string|undefined} the serialized document, or undefined if
     * text is undefined
     */
    normalizeContent(text) {
        if (text === undefined) {
            return undefined;
        }

        const contentDocument = document.implementation.createHTMLDocument();
        const parsedBody = new DOMParser()
            .parseFromString(text, 'text/html')
            .body;
        contentDocument.body = contentDocument.adoptNode(parsedBody);

        return new XMLSerializer().serializeToString(contentDocument);
    }

    get content() {
        return this._content;
    }

    set content(content) {
        this._content = this.normalizeContent(content);
    }
}

/**
 * Parsed feed consisting of its URL, title, subtitle, optional logo and
 * entries. The title defaults to a localized default title, the subtitle to
 * the empty string and the entries to an empty array.
 */
class Feed {
    constructor(url, {title = browser.i18n.getMessage('defaultFeedTitle'),
            subtitle = '', logo, entries = []} = {}) {
        Object.assign(this, {url, title, subtitle, logo, entries});
    }
}

/**
 * Parser turning Atom 1.0, RSS 0.9 (RDF based) and RSS 0.9x/2.0 documents
 * into Feed objects.
 *
 * A parser instance keeps the URL and the parsed XML document of the feed
 * which is currently being processed in its url and document properties.
 */
export class FeedParser {
    /**
     * Determine type and version of a feed from its root element.
     *
     * @param {Document} feedDocument - parsed XML document
     * @returns {Array} a [type, version] pair where type is 'atom' or 'rss',
     * or [undefined, undefined] if the document is not a supported feed
     */
    static probeFeed(feedDocument) {
        const documentElement = feedDocument.documentElement;
        // the local name is compared so that a namespace-prefixed root
        // element such as <atom:feed> is recognized as well
        if (documentElement.localName === 'feed' &&
                documentElement.namespaceURI === XMLNS.ATOM10) {
            let version = documentElement.getAttribute('version');
            if (version === null) {
                version = '1.0';
            }
            if (version === '1.0') {
                return ['atom', version];
            }
        } else if (documentElement.nodeName === 'rss') {
            const version = documentElement.getAttribute('version');
            switch (version) {
                case '0.90':
                case '0.91':
                case '0.92':
                case '0.93':
                case '0.94':
                case '2.0':
                    return ['rss', version];
            }
        } else if (documentElement.localName.toLowerCase() === 'rdf' &&
                documentElement.getAttribute('xmlns') === XMLNS.RSS09) {
            return ['rss', '0.9'];
        }

        return [undefined, undefined];
    }

    constructor() {
        this.url = undefined;
        this.document = undefined;
    }

    /**
     * Create a FeedLogo from an Atom <logo> element.
     *
     * @param {Element} logoElement - the <logo> element
     * @returns {FeedLogo} the logo
     * @throws {TypeError} if the element does not contain a valid URL
     */
    parseAtomLogo(logoElement) {
        const url = parseURL(logoElement.textContent.trim(), this.url);
        if (url === null) {
            throw new TypeError('invalid URL in <logo> element');
        }
        return new FeedLogo(url);
    }

    /**
     * Extract the content of an Atom element whose type attribute is
     * "text" (the default if the attribute is absent), "html" or "xhtml".
     *
     * @param {Element} containerElement - element holding the content
     * @param {boolean} [textOnly=true] - if true, markup is stripped and
     * plain text is returned; if false, markup is returned, with plain text
     * being escaped and wrapped in a <pre> element
     * @returns {string|undefined} the content, or undefined if the content
     * type is not supported or XHTML content is not wrapped in an XHTML
     * <div> element
     */
    parseAtomTextConstruct(containerElement, textOnly = true) {
        let contentType = containerElement.getAttribute('type');
        if (contentType === null) {
            contentType = 'text';
        }

        if (contentType === 'xhtml') {
            const xhtmlRootElement = containerElement.firstElementChild;
            if (xhtmlRootElement !== null &&
                    xhtmlRootElement.localName === 'div' &&
                    xhtmlRootElement.namespaceURI === XMLNS.XHTML) {
                return textOnly ? xhtmlRootElement.textContent.trim() :
                        xhtmlRootElement.innerHTML;
            }
        } else if (contentType === 'html') {
            const htmlText = containerElement.textContent;
            if (textOnly) {
                // parse the markup in order to strip it
                const htmlDocument = new DOMParser().parseFromString(htmlText,
                        'text/html');
                return htmlDocument.body.textContent.trim();
            }
            return htmlText;
        } else if (contentType === 'text') {
            const text = containerElement.textContent.trim();
            return textOnly ? text : `<pre>${encodeXML(text)}</pre>`;
        }

        // unsupported content type
        return undefined;
    }

    /**
     * Extract the markup of an Atom <content> element.
     *
     * @param {Element} contentElement - the <content> element
     * @returns {string|undefined} the content markup, or undefined if the
     * content is referenced via a src attribute or its type is unsupported
     */
    parseAtomContent(contentElement) {
        const contentSrc = contentElement.getAttribute('src');
        if (contentSrc !== null) {
            // externally referenced content is not supported
            return undefined;
        }
        return this.parseAtomTextConstruct(contentElement, false);
    }

    /**
     * Create a FeedEntry from an Atom <entry> element.
     *
     * The content is taken from the <content> element and falls back to the
     * <summary> element if no usable content is available.
     *
     * @param {Element} entryElement - the <entry> element
     * @returns {FeedEntry} the entry
     */
    parseAtomEntry(entryElement) {
        let title;
        let link;
        let date;
        let content;
        const titleElement = feedQueryXPath(this.document, entryElement,
                './atom:title');
        if (titleElement !== null) {
            title = this.parseAtomTextConstruct(titleElement);
        }

        const linkElement = feedQueryXPath(this.document, entryElement,
                './atom:link[@href][not(@rel) or @rel="alternate"]');
        if (linkElement !== null) {
            link = parseURL(linkElement.getAttribute('href'), this.url);
        }

        const updatedElement = feedQueryXPath(this.document, entryElement,
                './atom:updated');
        if (updatedElement !== null) {
            date = parseDate(updatedElement.textContent);
        }

        const contentElement = feedQueryXPath(this.document, entryElement,
                './atom:content');
        if (contentElement !== null) {
            content = this.parseAtomContent(contentElement);
        }
        if (typeof content === 'undefined') {
            const summaryElement = feedQueryXPath(this.document, entryElement,
                    './atom:summary');
            if (summaryElement !== null) {
                content = this.parseAtomTextConstruct(summaryElement, false);
            }
        }

        return new FeedEntry({title, link, date, content});
    }

    /**
     * Create a Feed from the Atom document stored in this.document.
     *
     * @returns {Feed} the feed; an invalid logo is ignored
     */
    parseAtomFeed() {
        let title;
        let subtitle;
        let logo;
        const entries = [];
        const documentElement = this.document.documentElement;

        const titleElement = feedQueryXPath(this.document, documentElement,
                './atom:title');
        if (titleElement !== null) {
            title = this.parseAtomTextConstruct(titleElement);
        }

        const subtitleElement = feedQueryXPath(this.document, documentElement,
                './atom:subtitle');
        if (subtitleElement !== null) {
            subtitle = this.parseAtomTextConstruct(subtitleElement);
        }

        const logoElement = feedQueryXPath(this.document, documentElement,
                './atom:logo');
        if (logoElement !== null) {
            try {
                logo = this.parseAtomLogo(logoElement);
            } catch (e) {
                // a TypeError signals an unusable logo which is skipped
                if (!(e instanceof TypeError)) {
                    throw e;
                }
            }
        }

        const entryElements = feedQueryXPathAll(this.document,
                documentElement, './atom:entry');
        for (const entryElement of entryElements) {
            entries.push(this.parseAtomEntry(entryElement));
        }

        return new Feed(this.url, {title, subtitle, logo, entries});
    }

    /**
     * Create a FeedLogo from an RSS 0.9 <image> element.
     *
     * @param {Element} imageElement - the <image> element
     * @returns {FeedLogo} the logo
     * @throws {TypeError} if the <url> child element is missing or does not
     * contain a valid URL
     */
    parseRSS1Logo(imageElement) {
        let title;
        const urlElement = feedQueryXPath(this.document, imageElement,
                './rss:url');
        if (urlElement === null) {
            throw new TypeError('missing <url> element in <image> element');
        }
        const url = parseURL(urlElement.textContent.trim(), this.url);
        if (url === null) {
            throw new TypeError('invalid URL in <image> element');
        }

        const titleElement = feedQueryXPath(this.document, imageElement,
                './rss:title');
        if (titleElement !== null) {
            title = titleElement.textContent.trim();
        }

        return new FeedLogo(url, {title});
    }

    /**
     * Create a FeedEntry from an RSS 0.9 <item> element.
     *
     * @param {Element} itemElement - the <item> element
     * @returns {FeedEntry} the entry consisting of a title and a link
     */
    parseRSS1Entry(itemElement) {
        let title;
        let link;
        const titleElement = feedQueryXPath(this.document, itemElement,
                './rss:title');
        if (titleElement !== null) {
            title = titleElement.textContent;
        }

        const linkElement = feedQueryXPath(this.document, itemElement,
                './rss:link');
        if (linkElement !== null) {
            link = parseURL(linkElement.textContent, this.url);
        }

        return new FeedEntry({title, link});
    }

    /**
     * Create a Feed from the RSS 0.9 document stored in this.document.
     *
     * @returns {Feed} the feed; an invalid logo is ignored
     */
    parseRSS1Feed() {
        let title;
        let subtitle;
        let logo;
        const entries = [];
        const documentElement = this.document.documentElement;
        const titleElement = feedQueryXPath(this.document, documentElement,
                './rss:channel/rss:title');
        if (titleElement !== null) {
            title = titleElement.textContent;
        }

        // the elements are looked up in the RSS 0.9 namespace like all other
        // elements of this format, an unprefixed name would only match
        // elements in no namespace
        const descriptionElement = feedQueryXPath(this.document,
                documentElement, './rss:channel/rss:description');
        if (descriptionElement !== null) {
            subtitle = descriptionElement.textContent;
        }

        const imageElement = feedQueryXPath(this.document, documentElement,
                './rss:image');
        if (imageElement !== null) {
            try {
                logo = this.parseRSS1Logo(imageElement);
            } catch (e) {
                // a TypeError signals an unusable logo which is skipped
                if (!(e instanceof TypeError)) {
                    throw e;
                }
            }
        }

        const itemElements = feedQueryXPathAll(this.document, documentElement,
                './rss:item');
        for (const itemElement of itemElements) {
            const entry = this.parseRSS1Entry(itemElement);
            if (typeof entry !== 'undefined') {
                entries.push(entry);
            }
        }

        return new Feed(this.url, {title, subtitle, logo, entries});
    }

    /**
     * Create a FeedLogo from an RSS 0.9x/2.0 <image> element.
     *
     * @param {Element} imageElement - the <image> element
     * @returns {FeedLogo} the logo
     * @throws {TypeError} if the <url> child element is missing or does not
     * contain a valid URL
     */
    parseRSS2Logo(imageElement) {
        let title;
        const urlElement = feedQueryXPath(this.document, imageElement,
                './url');
        if (urlElement === null) {
            throw new TypeError('missing <url> element in <image> element');
        }
        const url = parseURL(urlElement.textContent.trim(), this.url);
        if (url === null) {
            throw new TypeError('invalid URL in <image> element');
        }

        const titleElement = feedQueryXPath(this.document, imageElement,
                './title');
        if (titleElement !== null) {
            title = titleElement.textContent.trim();
        }

        return new FeedLogo(url, {title});
    }

    /**
     * Create a FeedEntryFile from an RSS <enclosure> element.
     *
     * @param {Element} enclosureElement - the <enclosure> element
     * @returns {FeedEntryFile} the file; type and size are only set if the
     * type attribute is present and the length attribute is an integer
     * @throws {TypeError} if the url attribute is missing or invalid
     */
    parseRSS2EntryFile(enclosureElement) {
        let type;
        let size;
        const urlAttribute = enclosureElement.getAttribute('url');
        if (urlAttribute === null) {
            // without this check the missing value would be resolved as the
            // relative reference "null"
            throw new TypeError('missing url attribute in <enclosure> ' +
                    'element');
        }
        const url = parseURL(urlAttribute, this.url);
        if (url === null) {
            throw new TypeError('invalid URL in <enclosure> element');
        }

        const typeAttribute = enclosureElement.getAttribute('type');
        if (typeAttribute !== null) {
            type = typeAttribute;
        }

        const length = parseInt(enclosureElement.getAttribute('length'),
                10);
        if (!isNaN(length)) {
            size = length;
        }

        return new FeedEntryFile(url, {type, size});
    }

    /**
     * Create a FeedEntry from an RSS 0.9x/2.0 <item> element.
     *
     * @param {Element} itemElement - the <item> element
     * @returns {FeedEntry} the entry; enclosures without a valid URL are
     * skipped
     */
    parseRSS2Entry(itemElement) {
        let title;
        let link;
        let date;
        let content;
        const files = [];
        const titleElement = feedQueryXPath(this.document, itemElement,
                './title');
        if (titleElement !== null) {
            title = titleElement.textContent;
        }

        const linkElement = feedQueryXPath(this.document, itemElement,
                './link');
        if (linkElement !== null) {
            link = parseURL(linkElement.textContent, this.url);
        }

        const pubDateElement = feedQueryXPath(this.document, itemElement,
                './pubDate');
        if (pubDateElement !== null) {
            date = parseDate(pubDateElement.textContent);
        }

        const descriptionElement = feedQueryXPath(this.document, itemElement,
                './description');
        if (descriptionElement !== null) {
            content = descriptionElement.textContent.trim();
        }

        for (const enclosureElement of
                feedQueryXPathAll(this.document, itemElement, './enclosure')) {
            try {
                const entryFile = this.parseRSS2EntryFile(enclosureElement);
                files.push(entryFile);
            } catch (e) {
                // a TypeError signals an unusable enclosure which is skipped
                if (!(e instanceof TypeError)) {
                    throw e;
                }
            }
        }

        return new FeedEntry({title, link, date, content, files});
    }

    /**
     * Create a Feed from the RSS 0.9x/2.0 document stored in this.document.
     *
     * @returns {Feed} the feed; an invalid logo is ignored
     */
    parseRSS2Feed() {
        let title;
        let subtitle;
        let logo;
        const entries = [];
        const documentElement = this.document.documentElement;
        const titleElement = feedQueryXPath(this.document, documentElement,
                './channel/title');
        if (titleElement !== null) {
            title = titleElement.textContent;
        }

        const descriptionElement = feedQueryXPath(this.document,
                documentElement, './channel/description');
        if (descriptionElement !== null) {
            subtitle = descriptionElement.textContent;
        }

        const imageElement = feedQueryXPath(this.document, documentElement,
                './channel/image');
        if (imageElement !== null) {
            try {
                logo = this.parseRSS2Logo(imageElement);
            } catch (e) {
                // a TypeError signals an unusable logo which is skipped
                if (!(e instanceof TypeError)) {
                    throw e;
                }
            }
        }

        const itemElements = feedQueryXPathAll(this.document, documentElement,
                './channel/item');
        for (const itemElement of itemElements) {
            const entry = this.parseRSS2Entry(itemElement);
            if (typeof entry !== 'undefined') {
                entries.push(entry);
            }
        }

        return new Feed(this.url, {title, subtitle, logo, entries});
    }

    /**
     * Parse the XML source of a feed.
     *
     * @param {string} xmlString - XML source of the feed
     * @param {string} url - URL of the feed, used as the base URL of
     * relative links
     * @returns {Feed} the parsed feed
     * @throws {ParserError} if the root element of the parsed document is a
     * parsererror element
     * @throws {UnsupportedFeedTypeError} if the document is not a supported
     * feed
     */
    parseFromString(xmlString, url) {
        this.url = url;
        this.document = new DOMParser().parseFromString(xmlString,
                'application/xml');
        if (this.document.documentElement.nodeName.toLowerCase() ===
                'parsererror') {
            throw new ParserError(this.document.documentElement.textContent);
        }

        const [type, version] = this.constructor.probeFeed(this.document);
        if (type === 'atom') {
            return this.parseAtomFeed();
        } else if (type === 'rss') {
            if (version === '0.9') {
                return this.parseRSS1Feed();
            } else {
                return this.parseRSS2Feed();
            }
        }
        throw new UnsupportedFeedTypeError();
    }
}