view js/feed-parser.js @ 19:3fcd2209b39a

Add support for RSS 1.0 feeds
author Guido Berhoerster <guido+feed-preview@berhoerster.name>
date Thu, 13 Dec 2018 09:02:40 +0100
parents 15db49e77deb
children da483ce3832d
line wrap: on
line source

/*
 * Copyright (C) 2018 Guido Berhoerster <guido+feed-preview@berhoerster.name>
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */

'use strict';

export const XMLNS = {
    ATOM03: 'http://purl.org/atom/ns#',
    ATOM10: 'http://www.w3.org/2005/Atom',
    RSS09: 'http://my.netscape.com/rdf/simple/0.9/',
    RSS10: 'http://purl.org/rss/1.0/',
    XHTML: 'http://www.w3.org/1999/xhtml',
    PARSERERROR: 'http://www.mozilla.org/newlayout/xml/parsererror.xml'
}
const ALLOWED_LINK_PROTOCOLS = new Set(['http:', 'https:', 'ftp:']);

function encodeXML(str) {
    return str.replace(/[<>&'"]/g, c => {
        switch (c) {
            case '<': return '&lt;';
            case '>': return '&gt;';
            case '&': return '&amp;';
            case '\'': return '&apos;';
            case '"': return '&quot;';
        }
    });
}

function parseDate(s) {
    let date = new Date(s);

    return isNaN(date) ? new Date(0) : date;
}

function parseURL(text, baseURL = '') {
    let url;

    try {
        url = new URL(text, baseURL);
    } catch (e) {
        return null;
    }
    if (!ALLOWED_LINK_PROTOCOLS.has(url.protocol)) {
        return null;
    }

    return url;
}

function base64Decode(base64Str) {
    let encodedText;
    try {
        encodedText = atob(base64Str);
    } catch (e) {
        throw (e instanceof DOMException) ? new TypeError(e.message) : e;
    }
    let byteBuffer = new Uint8Array(new ArrayBuffer(encodedText.length));
    for (let i = 0; i < encodedText.length; i++) {
        byteBuffer[i] = encodedText.charCodeAt(i);
    }
    return new TextDecoder().decode(byteBuffer);
}

function feedNSResolver(prefix) {
    switch (prefix) {
        case 'atom03':
            return XMLNS.ATOM03;
        case 'atom':
            return XMLNS.ATOM10;
        case 'rss09':
            return XMLNS.RSS09;
        case 'rss10':
            return XMLNS.RSS10;
    }
    return null;
}

function feedQueryXPath(feedDocument, scopeElement, xpathQuery) {
    return feedDocument.evaluate(xpathQuery, scopeElement, feedNSResolver,
            XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
}

function feedQueryXPathAll(feedDocument, scopeElement, xpathQuery) {
    let result = feedDocument.evaluate(xpathQuery, scopeElement, feedNSResolver,
            XPathResult.ORDERED_NODE_ITERATOR_TYPE, null);
    let nodes = [];
    for (let node = result.iterateNext(); node !== null;
            node = result.iterateNext()) {
        nodes.push(node);
    }

    return nodes;
}

export class ParserError extends Error {
    constructor() {
        super(...arguments);
        this.name = this.constructor.name;
    }
}

export class UnsupportedFeedTypeError extends Error {
    constructor(message = 'Document is not a supported feed', ...params) {
        super(message, ...params);
        this.name = this.constructor.name;
    }
}

export class ProtocolError extends Error {
    constructor(url, status, statusText, ...params) {
        let message = `Protocol error: Transfer of ${url} failed with: ` +
                `${status} ${statusText}`
        super(message, ...params);
        this.name = this.constructor.name;
        this.url = url;
        this.status = status;
        this.statusText = statusText;
    }
}

class FeedLogo {
    constructor(url, {title = ''} = {}) {
        this.url = url;
        this.title = title;
    }
}

class FeedEntryFile {
    constructor(url, {type = browser.i18n.getMessage('defaultFileType'),
            size = 0} = {}) {
        this.filename = undefined;
        this._url = undefined;
        this.url = url;
        this.type = type;
        this.size = size;
    }

    set url(url) {
        this._url = url;
        let filename = url.pathname.split('/').pop();
        this.filename = filename !== '' ? filename :
                browser.i18n.getMessage('defaultFileName');
    }

    get url() {
        return this._url;
    }
}

class FeedEntry {
    constructor({title = browser.i18n.getMessage('defaultFeedEntryTitle'),
            link = undefined, date = new Date(0), content = '',
            files = []} = {}) {
        this.title = title;
        this.link = link;
        this.date = date;
        this._content = undefined;
        this.content = content;
        this.files = files;
    }

    normalizeContent(text) {
        if (typeof text === 'undefined') {
            return
        }

        let contentDocument = document.implementation.createHTMLDocument();
        let parsedDocument = new DOMParser().parseFromString(text, 'text/html');
        contentDocument.body = contentDocument.adoptNode(parsedDocument.body);
        return new XMLSerializer().serializeToString(contentDocument);
    }

    set content(content) {
        this._content = this.normalizeContent(content);
    }

    get content() {
        return this._content;
    }
}

class Feed {
    constructor(url, {title = browser.i18n.getMessage('defaultFeedTitle'),
            subtitle = '', logo, entries = []} = {}) {
        this.url = url;
        this.title = title;
        this.subtitle = subtitle;
        this.logo = logo;
        this.entries = entries;
    }
}

export class FeedParser {
    static probeFeed(feedDocument) {
        let documentElement = feedDocument.documentElement;
        if (documentElement.nodeName === 'feed' &&
                documentElement.namespaceURI === XMLNS.ATOM03) {
            return ['atom', '0.3'];
        } else if (documentElement.nodeName === 'feed' &&
                documentElement.namespaceURI === XMLNS.ATOM10) {
            return ['atom', '1.0'];
        } else if (documentElement.nodeName === 'rss') {
            let version = documentElement.getAttribute('version');
            switch (version) {
                case '0.90':
                case '0.91':
                case '0.92':
                case '0.93':
                case '0.94':
                case '2.0':
                    return ['rss', version];
            }
        } else if (documentElement.localName.toLowerCase() === 'rdf' &&
                documentElement.getAttribute('xmlns') === XMLNS.RSS09) {
            return ['rss', '0.9'];
        } else if (documentElement.localName.toLowerCase() === 'rdf' &&
                documentElement.getAttribute('xmlns') === XMLNS.RSS10) {
            return ['rss', '1.0'];
        }

        return [undefined, undefined];
    }

    constructor() {
        this.url = undefined;
        this.document = undefined;
    }

    parseAtom03ContentConstruct(containerElement, textOnly = true) {
        let contentType = containerElement.getAttribute('type');
        let contentMode = containerElement.getAttribute('mode');
        if (contentType === null) {
            contentType = 'text/plain';
        }
        if (contentMode === null) {
            contentMode = 'xml';
        }
        if (contentType === 'application/xhtml+xml') {
            let htmlText;
            if (contentMode === 'xml') {
                return textOnly ? containerElement.textContent.trim() :
                        containerElement.innerHTML;
            } else if (contentMode === 'escaped') {
                htmlText = containerElement.textContent;
            } else if (contentMode === 'base64') {
                htmlText = base64Decode(containerElement.textContent);
            }
            if (typeof htmlText === 'undefined') {
                return;
            }
            if (textOnly) {
                let htmlDocument = new DOMParser().parseFromString(htmlText,
                        'application/xhtml+xml');
                if (htmlDocument.documentElement.namespaceURI ===
                        XMLNS.PARSERERROR) {
                    return;
                }
                return htmlDocument.body.textContent.trim();
            }
            return htmlText;
        } else if (contentType === 'text/html') {
            let htmlText;
            if (contentMode === 'escaped') {
                htmlText = containerElement.textContent;
            } else if (contentMode === 'base64') {
                htmlText = base64Decode(containerElement.textContent);
            }
            if (typeof htmlText === 'undefined') {
                return;
            }
            if (textOnly) {
                let htmlDocument = new DOMParser().parseFromString(htmlText,
                        'text/html');
                return htmlDocument.body.textContent.trim();
            }
            return htmlText;
        } else if (contentType === 'text/plain') {
            let text;
            if (contentMode === 'escaped') {
                text = containerElement.textContent;
            } else if (contentMode === 'base64') {
                text = base64Decode(containerElement.textContent);
            }
            if (typeof text === 'undefined') {
                return;
            }
            return textOnly ? text : `<pre>${encodeXML(text)}</pre>`;
        }
        return;
    }

    parseAtom03Content(contentElement) {
        // ordered from lowest to highest preference
        const contentTypes = [
            'text/plain',
            'text/html',
            'application/xhtml+xml'
        ];
        if (contentElement.getAttribute('type') === 'multipart/alternative' &&
                contentElement.getAttribute('mode') === null) {
            // select alternative according to above preference
            let selectedTypeIndex = -1;
            let selectedElement;
            for (let innerContentElement of contentElement.children) {
                if (innerContentElement.localName !== 'content' ||
                        innerContentElement.namespaceURI !== XMLNS.ATOM03) {
                    throw new TypeError('child elements of a multipart ' +
                            ' content elements must be content elements');
                }
                let innerContentType = innerContentElement.getAttribute('type');
                if (innerContentType === null) {
                    innerContentType = 'text/plain';
                }
                let typeIndex = contentTypes.indexOf(innerContentType);
                if (typeIndex > selectedTypeIndex) {
                    selectedTypeIndex = typeIndex;
                    selectedElement = innerContentElement;
                }
            }
            if (selectedTypeIndex >= 0) {
                contentElement = selectedElement;
            }
        }

        return this.parseAtom03ContentConstruct(contentElement, false);
    }

    parseAtom03Entry(entryElement) {
        let title;
        let link;
        let date;
        let content;
        let titleElement = feedQueryXPath(this.document, entryElement,
                './atom03:title');
        if (titleElement !== null) {
            title = titleElement.textContent.trim();
        }

        let linkElement = feedQueryXPath(this.document, entryElement,
                './atom03:link[@href][@rel="alternate"]');
        if (linkElement !== null) {
            link = parseURL(linkElement.getAttribute('href'), this.url);
        }

        let modifiedElement = feedQueryXPath(this.document, entryElement,
                './atom03:modified');
        if (modifiedElement !== null) {
            date = parseDate(modifiedElement.textContent);
        }

        let contentElement = feedQueryXPath(this.document, entryElement,
                './atom03:content');
        if (contentElement !== null) {
            try {
                content = this.parseAtom03Content(contentElement);
            } catch (e) {
                if (!(e instanceof TypeError)) {
                    throw e;
                }
            }
        }
        if (typeof content === 'undefined') {
            let summaryElement = feedQueryXPath(this.document, entryElement,
                    './atom03:summary');
            if (summaryElement !== null) {
                try {
                    content = this.parseAtom03ContentConstruct(summaryElement,
                            false);
                } catch (e) {
                    if (!(e instanceof TypeError)) {
                        throw e;
                    }
                }
            }
        }

        return new FeedEntry({title, link, date, content});
    }

    parseAtom03Feed() {
        let title;
        let subtitle;
        let logo;
        let entries = [];
        let documentElement = this.document.documentElement;

        let titleElement = feedQueryXPath(this.document, documentElement,
                './atom03:title');
        if (titleElement !== null) {
            try {
                title = this.parseAtom03ContentConstruct(titleElement);
            } catch (e) {
                if (!(e instanceof TypeError)) {
                    throw e;
                }
            }
        }

        let taglineElement = feedQueryXPath(this.document, documentElement,
                './atom03:tagline');
        if (taglineElement !== null) {
            try {
                title = this.parseAtom03ContentConstruct(taglineElement);
            } catch (e) {
                if (!(e instanceof TypeError)) {
                    throw e;
                }
            }
        }

        let logoElement = feedQueryXPath(this.document, documentElement,
                './atom03:logo');
        if (logoElement !== null) {
            try {
                logo = this.parseAtomLogo(logoElement);
            } catch (e) {
                if (!(e instanceof TypeError)) {
                    throw e;
                }
            }
        }

        let entryElements = feedQueryXPathAll(this.document, documentElement,
                './atom03:entry');
        for (let entryElement of entryElements) {
            entries.push(this.parseAtom03Entry(entryElement));
        }

        return new Feed(this.url, {title, subtitle, logo, entries});
    }

    parseAtomLogo(logoElement) {
        let url = parseURL(logoElement.textContent.trim(), this.url);
        if (url === null) {
            throw new TypeError('invalid URL in <logo> element');
        }
        return new FeedLogo(url);
    }

    parseAtomTextConstruct(containerElement, textOnly = true) {
        let contentType = containerElement.getAttribute('type');
        if (contentType === null) {
            contentType = 'text';
        }

        if (contentType === 'xhtml') {
            let xhtmlRootElement = containerElement.firstElementChild;
            if (xhtmlRootElement !== null &&
                    xhtmlRootElement.localName === 'div' &&
                    xhtmlRootElement.namespaceURI === XMLNS.XHTML) {
                return textOnly ? xhtmlRootElement.textContent.trim() :
                        xhtmlRootElement.innerHTML;
            }
        } else if (contentType === 'html') {
            let htmlText = containerElement.textContent;
            if (textOnly) {
                let htmlDocument = new DOMParser().parseFromString(htmlText,
                        'text/html');
                return htmlDocument.body.textContent.trim();
            }
            return htmlText
        } else if (contentType === 'text') {
            let text = containerElement.textContent.trim();
            return textOnly ? text : `<pre>${encodeXML(text)}</pre>`;
        }

        // unsupported content type
        return;
    }

    parseAtomContent(contentElement) {
        let contentSrc = contentElement.getAttribute('src');
        if (contentSrc !== null) {
            // externally referenced content is not supported
            return;
        }
        return this.parseAtomTextConstruct(contentElement, false);
    }

    parseAtomEntryFile(enclosureElement) {
        let type;
        let size;
        let url = parseURL(enclosureElement.getAttribute('href'), this.url);
        if (url === null) {
            throw new TypeError('invalid URL in enclosure href attribute');
        }

        let typeAttribute = enclosureElement.getAttribute('type');
        if (typeAttribute !== null) {
            type = typeAttribute;
        }

        let length = parseInt(enclosureElement.getAttribute('length'), 10);
        if (!isNaN(length)) {
            size = length;
        }

        return new FeedEntryFile(url, {type, size});
    }

    parseAtomEntry(entryElement) {
        let title;
        let link;
        let date;
        let content;
        let files = [];
        let titleElement = feedQueryXPath(this.document, entryElement,
                './atom:title');
        if (titleElement !== null) {
            title = this.parseAtomTextConstruct(titleElement);
        }

        let linkElement = feedQueryXPath(this.document, entryElement,
                './atom:link[@href][not(@rel) or @rel="alternate"]');
        if (linkElement !== null) {
            link = parseURL(linkElement.getAttribute('href'), this.url);
        }

        let updatedElement = feedQueryXPath(this.document, entryElement,
                './atom:updated');
        if (updatedElement !== null) {
            date = parseDate(updatedElement.textContent);
        }

        let contentElement = feedQueryXPath(this.document, entryElement,
                './atom:content');
        if (contentElement !== null) {
            content = this.parseAtomContent(contentElement);
        }
        if (typeof content === 'undefined') {
            let summaryElement = feedQueryXPath(this.document, entryElement,
                    './atom:summary');
            if (summaryElement !== null) {
                content = this.parseAtomTextConstruct(summaryElement, false);
            }
        }

        for (let enclosureElement of feedQueryXPathAll(this.document,
                entryElement, './atom:link[@href][@rel="enclosure"]')) {
            try {
                let entryFile = this.parseAtomEntryFile(enclosureElement);
                files.push(entryFile);
            } catch (e) {
                if (!(e instanceof TypeError)) {
                    throw e;
                }
            }
        }

        return new FeedEntry({title, link, date, content, files});
    }

    parseAtomFeed() {
        let title;
        let subtitle;
        let logo;
        let entries = [];
        let documentElement = this.document.documentElement;

        let titleElement = feedQueryXPath(this.document, documentElement,
                './atom:title');
        if (titleElement !== null) {
            title = this.parseAtomTextConstruct(titleElement);
        }

        let subtitleElement = feedQueryXPath(this.document, documentElement,
                './atom:subtitle');
        if (subtitleElement !== null) {
            subtitle = this.parseAtomTextConstruct(subtitleElement);
        }

        let logoElement = feedQueryXPath(this.document, documentElement,
                './atom:logo');
        if (logoElement !== null) {
            try {
                logo = this.parseAtomLogo(logoElement);
            } catch (e) {
                if (!(e instanceof TypeError)) {
                    throw e;
                }
            }
        }

        let entryElements = feedQueryXPathAll(this.document, documentElement,
                './atom:entry');
        for (let entryElement of entryElements) {
            entries.push(this.parseAtomEntry(entryElement));
        }

        return new Feed(this.url, {title, subtitle, logo, entries});
    }

    parseRSS1Logo(imageElement, nsPrefix) {
        let title;
        let urlElement = feedQueryXPath(this.document, imageElement,
                `./${nsPrefix}:url`);
        if (urlElement === null) {
            throw new TypeError('missing <url> element in <logo> element');
        }
        let url = parseURL(urlElement.textContent.trim(), this.url);
        if (url === null) {
            throw new TypeError('invalid URL in <logo> element');
        }

        let titleElement = feedQueryXPath(this.document, imageElement,
                `./${nsPrefix}:title`);
        if (titleElement !== null) {
            title = titleElement.textContent.trim();
        }

        return new FeedLogo(url, {title});
    }

    parseRSS1Entry(itemElement, nsPrefix) {
        let title;
        let link;
        let titleElement = feedQueryXPath(this.document, itemElement,
                `./${nsPrefix}:title`);
        if (titleElement !== null) {
            title = titleElement.textContent;
        }

        let linkElement = feedQueryXPath(this.document, itemElement,
                `./${nsPrefix}:link`);
        if (linkElement !== null) {
            link = parseURL(linkElement.textContent, this.url);
        }

        return new FeedEntry({title, link});
    }

    parseRSS1Feed(version) {
        let nsPrefix = version === '0.9' ? 'rss09' : 'rss10';
        let title;
        let subtitle;
        let logo;
        let entries = [];
        let documentElement = this.document.documentElement;
        let titleElement = feedQueryXPath(this.document, documentElement,
                `./${nsPrefix}:channel/${nsPrefix}:title`);
        if (titleElement !== null) {
            title = titleElement.textContent;
        }

        let descriptionElement = feedQueryXPath(this.document, documentElement,
                `./${nsPrefix}:channel/${nsPrefix}:description`);
        if (descriptionElement !== null) {
            subtitle = descriptionElement.textContent;
        }

        let imageElement = feedQueryXPath(this.document, documentElement,
                `./${nsPrefix}:image`);
        if (imageElement !== null) {
            try {
                logo = this.parseRSS1Logo(imageElement, nsPrefix);
            } catch (e) {
                if (!(e instanceof TypeError)) {
                    throw e;
                }
            }
        }

        let itemElements = feedQueryXPathAll(this.document, documentElement,
                `./${nsPrefix}:item`);
        for (let itemElement of itemElements) {
            let entry = this.parseRSS1Entry(itemElement, nsPrefix);
            if (typeof entry !== 'undefined') {
                entries.push(entry);
            }
        }

        return new Feed(this.url, {title, subtitle, logo, entries});
    }

    parseRSS2Logo(imageElement) {
        let title;
        let urlElement = feedQueryXPath(this.document, imageElement, './url');
        if (urlElement === null) {
            throw new TypeError('missing <url> element in <logo> element');
        }
        let url = parseURL(urlElement.textContent.trim(), this.url);
        if (url === null) {
            throw new TypeError('invalid URL in <logo> element');
        }

        let titleElement = feedQueryXPath(this.document, imageElement,
                './title');
        if (titleElement !== null) {
            title = titleElement.textContent.trim();
        }

        return new FeedLogo(url, {title});
    }

    parseRSS2EntryFile(enclosureElement) {
        let type;
        let size;
        let url = parseURL(enclosureElement.getAttribute('url'), this.url);
        if (url === null) {
            throw new TypeError('invalid URL in <enclosure> element');
        }

        let typeAttribute = enclosureElement.getAttribute('type');
        if (typeAttribute !== null) {
            type = typeAttribute;
        }

        let length = parseInt(enclosureElement.getAttribute('length'),
                10);
        if (!isNaN(length)) {
            size = length;
        }

        return new FeedEntryFile(url, {type, size});
    }

    parseRSS2Entry(itemElement) {
        let title;
        let link;
        let date;
        let content;
        let files = [];
        let titleElement = feedQueryXPath(this.document, itemElement,
                './title');
        if (titleElement !== null) {
            title = titleElement.textContent;
        }

        let linkElement = feedQueryXPath(this.document, itemElement, './link');
        if (linkElement !== null) {
            link = parseURL(linkElement.textContent, this.url);
        }

        let pubDateElement = feedQueryXPath(this.document, itemElement,
                './pubDate');
        if (pubDateElement !== null) {
            date = parseDate(pubDateElement.textContent);
        }

        let descriptionElement = feedQueryXPath(this.document, itemElement,
                './description');
        if (descriptionElement !== null) {
            content = descriptionElement.textContent.trim();
        }

        for (let enclosureElement of
                feedQueryXPathAll(this.document, itemElement, './enclosure')) {
            try {
                let entryFile = this.parseRSS2EntryFile(enclosureElement);
                files.push(entryFile);
            } catch (e) {
                if (!(e instanceof TypeError)) {
                    throw e;
                }
            }
        }

        return new FeedEntry({title, link, date, content, files});
    }

    parseRSS2Feed() {
        let title;
        let subtitle;
        let logo;
        let entries = [];
        let documentElement = this.document.documentElement;
        let titleElement = feedQueryXPath(this.document, documentElement,
                './channel/title');
        if (titleElement !== null) {
            title = titleElement.textContent;
        }

        let descriptionElement = feedQueryXPath(this.document, documentElement,
                './channel/description');
        if (descriptionElement !== null) {
            subtitle = descriptionElement.textContent;
        }

        let imageElement = feedQueryXPath(this.document, documentElement,
                './channel/image');
        if (imageElement !== null) {
            try {
                logo = this.parseRSS2Logo(imageElement);
            } catch (e) {
                if (!(e instanceof TypeError)) {
                    throw e;
                }
            }
        }

        let itemElements = feedQueryXPathAll(this.document, documentElement,
                './channel/item');
        for (let itemElement of itemElements) {
            let entry = this.parseRSS2Entry(itemElement);
            if (typeof entry !== 'undefined') {
                entries.push(entry);
            }
        }

        return new Feed(this.url, {title, subtitle, logo, entries});
    }

    parseFromString(xmlString, url) {
        this.url = url;
        this.document = new DOMParser().parseFromString(xmlString,
                'application/xml');
        if (this.document.documentElement.namespaceURI === XMLNS.PARSERERROR) {
            throw new ParserError(this.document.documentElement.textContent);
        }

        let [type, version] = this.constructor.probeFeed(this.document);
        if (type === 'atom') {
            if (version === '0.3') {
                return this.parseAtom03Feed();
            } else if (version === '1.0') {
                return this.parseAtomFeed();
            }
        } else if (type === 'rss') {
            if (version === '0.9' || version === '1.0') {
                return this.parseRSS1Feed(version);
            } else {
                return this.parseRSS2Feed();
            }
        }
        throw new UnsupportedFeedTypeError();
    }
}