view js/feed-parser.js @ 12:1e9aa12d7306 version-2

Release version 2
author Guido Berhoerster <guido+feed-preview@berhoerster.name>
date Sat, 08 Dec 2018 12:10:03 +0100
parents 2bbb7617dd13
children 376a0e415bba
line wrap: on
line source

/*
 * Copyright (C) 2018 Guido Berhoerster <guido+feed-preview@berhoerster.name>
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */

'use strict';

export const XMLNS = {
    ATOM10: 'http://www.w3.org/2005/Atom',
    RSS09: 'http://my.netscape.com/rdf/simple/0.9/'
}
const ALLOWED_LINK_PROTOCOLS = new Set(['http:', 'https:', 'ftp:']);

function encodeXML(str) {
    return str.replace(/[<>&'"]/g, c => {
        switch (c) {
            case '<': return '&lt;';
            case '>': return '&gt;';
            case '&': return '&amp;';
            case '\'': return '&apos;';
            case '"': return '&quot;';
        }
    });
}

function parseDate(s) {
    let date = new Date(s);

    return isNaN(date) ? new Date(0) : date;
}

function parseURL(text, baseURL = '') {
    let url;

    try {
        url = new URL(text, baseURL);
    } catch (e) {
        return null;
    }
    if (!ALLOWED_LINK_PROTOCOLS.has(url.protocol)) {
        return null;
    }

    return url;
}

function feedNSResolver(prefix) {
    switch (prefix) {
        case 'atom':
            return XMLNS.ATOM10;
        case 'rss':
            return XMLNS.RSS09;
    }
    return null;
}

function feedQueryXPath(feedDocument, scopeElement, xpathQuery) {
    return feedDocument.evaluate(xpathQuery, scopeElement, feedNSResolver,
            XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
}

function feedQueryXPathAll(feedDocument, scopeElement, xpathQuery) {
    let result = feedDocument.evaluate(xpathQuery, scopeElement, feedNSResolver,
            XPathResult.ORDERED_NODE_ITERATOR_TYPE, null);
    let nodes = [];
    for (let node = result.iterateNext(); node !== null;
            node = result.iterateNext()) {
        nodes.push(node);
    }

    return nodes;
}

export class ParserError extends Error {
    constructor() {
        super(...arguments);
        this.name = this.constructor.name;
    }
}

export class UnsupportedFeedTypeError extends Error {
    constructor(message = 'Document is not a supported feed', ...params) {
        super(message, ...params);
        this.name = this.constructor.name;
    }
}

export class ProtocolError extends Error {
    constructor(url, status, statusText, ...params) {
        let message = `Protocol error: Transfer of ${url} failed with: ` +
                `${status} ${statusText}`
        super(message, ...params);
        this.name = this.constructor.name;
        this.url = url;
        this.status = status;
        this.statusText = statusText;
    }
}

class FeedLogo {
    constructor(url, {title = ''} = {}) {
        this.url = url;
        this.title = title;
    }
}

class FeedEntryFile {
    constructor(url, {type = browser.i18n.getMessage('defaultFileType'),
            size = 0} = {}) {
        this.filename = undefined;
        this._url = undefined;
        this.url = url;
        this.type = type;
        this.size = size;
    }

    set url(url) {
        this._url = url;
        let filename = url.pathname.split('/').pop();
        this.filename = filename !== '' ? filename :
                browser.i18n.getMessage('defaultFileName');
    }

    get url() {
        return this._url;
    }
}

class FeedEntry {
    constructor({title = browser.i18n.getMessage('defaultFeedEntryTitle'),
            link = undefined, date = new Date(0), content = '',
            files = []} = {}) {
        this.title = title;
        this.link = link;
        this.date = date;
        this._content = undefined;
        this.content = content;
        this.files = files;
    }

    normalizeContent(text) {
        if (typeof text === 'undefined') {
            return
        }

        let contentDocument = document.implementation.createHTMLDocument();
        let parsedDocument = new DOMParser().parseFromString(text, 'text/html');
        contentDocument.body = contentDocument.adoptNode(parsedDocument.body);
        return new XMLSerializer().serializeToString(contentDocument);
    }

    set content(content) {
        this._content = this.normalizeContent(content);
    }

    get content() {
        return this._content;
    }
}

class Feed {
    constructor(url, {title = browser.i18n.getMessage('defaultFeedTitle'),
            subtitle = '', logo, entries = []} = {}) {
        this.url = url;
        this.title = title;
        this.subtitle = subtitle;
        this.logo = logo;
        this.entries = entries;
    }
}

export class FeedParser {
    static probeFeed(feedDocument) {
        let documentElement = feedDocument.documentElement;
        if (documentElement.nodeName === 'feed' &&
                documentElement.namespaceURI === XMLNS.ATOM10) {
            let version = documentElement.getAttribute('version');
            if (version === null) {
                version = '1.0';
            }
            if (version === '1.0') {
                return ['atom', version];
            }
        } else if (documentElement.nodeName === 'rss') {
            let version = documentElement.getAttribute('version');
            switch (version) {
                case '0.90':
                case '0.91':
                case '0.92':
                case '0.93':
                case '0.94':
                case '2.0':
                    return ['rss', version];
            }
        } else if (documentElement.localName.toLowerCase() === 'rdf' &&
                documentElement.getAttribute('xmlns') === XMLNS.RSS09) {
            return ['rss', '0.9'];
        }

        return [undefined, undefined];
    }

    constructor() {
        this.url = undefined;
        this.document = undefined;
    }

    parseAtomLogo(logoElement) {
        let url = parseURL(logoElement.textContent.trim(), this.url);
        if (url === null) {
            throw new TypeError('invalid URL in <logo> element');
        }
        return new FeedLogo(url);
    }

    parseAtomEntry(entryElement) {
        let title;
        let link;
        let date;
        let content;
        let titleElement = feedQueryXPath(this.document, entryElement,
                './atom:title');
        if (titleElement !== null) {
            title = titleElement.textContent.trim();
        }

        let linkElement = feedQueryXPath(this.document, entryElement,
                './atom:link[@href][not(@rel) or @rel="alternate"]');
        if (linkElement !== null) {
            link = parseURL(linkElement.getAttribute('href'), this.url);
        }

        let updatedElement = feedQueryXPath(this.document, entryElement,
                './atom:updated');
        if (updatedElement !== null) {
            date = parseDate(updatedElement.textContent);
        }

        let contentElement = feedQueryXPath(this.document, entryElement,
                './atom:content');
        if (contentElement === null) {
            contentElement = feedQueryXPath(this.document, entryElement,
                    './atom:summary');
        }
        if (contentElement !== null) {
            let contentType = contentElement.getAttribute('type');
            if (contentType === null) {
                contentType = 'text';
            }
            contentType = contentType.toLowerCase();
            if (contentType === 'xhtml') {
                content = contentElement.innerHTML;
            } else if (contentType === 'html') {
                content = contentElement.textContent;
            } else {
                let encodedContent =
                        encodeXML(contentElement.textContent.trim());
                content = `<pre>${encodedContent}</pre>`;
            }
        }

        return new FeedEntry({title, link, date, content});
    }

    parseAtomFeed() {
        let title;
        let subtitle;
        let logo;
        let entries = [];
        let documentElement = this.document.documentElement;

        let titleElement = feedQueryXPath(this.document, documentElement,
                './atom:title');
        if (titleElement !== null) {
            title = titleElement.textContent.trim();
        }

        let subtitleElement = feedQueryXPath(this.document, documentElement,
                './atom:subtitle');
        if (subtitleElement !== null) {
            subtitle = subtitleElement.textContent.trim();
        }

        let logoElement = feedQueryXPath(this.document, documentElement,
                './atom:logo');
        if (logoElement !== null) {
            try {
                logo = this.parseAtomLogo(logoElement);
            } catch (e) {
                if (!(e instanceof TypeError)) {
                    throw e;
                }
            }
        }

        let entryElements = feedQueryXPathAll(this.document, documentElement,
                './atom:entry');
        for (let entryElement of entryElements) {
            entries.push(this.parseAtomEntry(entryElement));
        }

        return new Feed(this.url, {title, subtitle, logo, entries});
    }

    parseRSS1Logo(imageElement) {
        let title;
        let urlElement = feedQueryXPath(this.document, imageElement,
                './rss:url');
        if (urlElement === null) {
            throw new TypeError('missing <url> element in <logo> element');
        }
        let url = parseURL(urlElement.textContent.trim(), this.url);
        if (url === null) {
            throw new TypeError('invalid URL in <logo> element');
        }

        let titleElement = feedQueryXPath(this.document, imageElement,
                './rss:title');
        if (titleElement !== null) {
            title = titleElement.textContent.trim();
        }

        return new FeedLogo(url, {title});
    }

    parseRSS1Entry(itemElement) {
        let title;
        let link;
        let titleElement = feedQueryXPath(this.document, itemElement,
                './rss:title');
        if (titleElement !== null) {
            title = titleElement.textContent;
        }

        let linkElement = feedQueryXPath(this.document, itemElement,
                './rss:link');
        if (linkElement !== null) {
            link = parseURL(linkElement.textContent, this.url);
        }

        return new FeedEntry({title, link});
    }

    parseRSS1Feed() {
        let title;
        let subtitle;
        let logo;
        let entries = [];
        let documentElement = this.document.documentElement;
        let titleElement = feedQueryXPath(this.document, documentElement,
                './rss:channel/rss:title');
        if (titleElement !== null) {
            title = titleElement.textContent;
        }

        let descriptionElement = feedQueryXPath(this.document, documentElement,
                './channel/description');
        if (descriptionElement !== null) {
            subtitle = descriptionElement.textContent;
        }

        let imageElement = feedQueryXPath(this.document, documentElement,
                './rss:image');
        if (imageElement !== null) {
            try {
                logo = this.parseRSS1Logo(imageElement);
            } catch (e) {
                if (!(e instanceof TypeError)) {
                    throw e;
                }
            }
        }

        let itemElements = feedQueryXPathAll(this.document, documentElement,
                './rss:item');
        for (let itemElement of itemElements) {
            let entry = this.parseRSS1Entry(itemElement);
            if (typeof entry !== 'undefined') {
                entries.push(entry);
            }
        }

        return new Feed(this.url, {title, subtitle, logo, entries});
    }

    parseRSS2Logo(imageElement) {
        let title;
        let urlElement = feedQueryXPath(this.document, imageElement, './url');
        if (urlElement === null) {
            throw new TypeError('missing <url> element in <logo> element');
        }
        let url = parseURL(urlElement.textContent.trim(), this.url);
        if (url === null) {
            throw new TypeError('invalid URL in <logo> element');
        }

        let titleElement = feedQueryXPath(this.document, imageElement,
                './title');
        if (titleElement !== null) {
            title = titleElement.textContent.trim();
        }

        return new FeedLogo(url, {title});
    }

    parseRSS2EntryFile(enclosureElement) {
        let type;
        let size;
        let url = parseURL(enclosureElement.getAttribute('url'), this.url);
        if (url === null) {
            throw new TypeError('invalid URL in <enclosure> element');
        }

        let typeAttribute = enclosureElement.getAttribute('type');
        if (typeAttribute !== null) {
            type = typeAttribute;
        }

        let length = parseInt(enclosureElement.getAttribute('length'),
                10);
        if (!isNaN(length)) {
            size = length;
        }

        return new FeedEntryFile(url, {type, size});
    }

    parseRSS2Entry(itemElement) {
        let title;
        let link;
        let date;
        let content;
        let files = [];
        let titleElement = feedQueryXPath(this.document, itemElement,
                './title');
        if (titleElement !== null) {
            title = titleElement.textContent;
        }

        let linkElement = feedQueryXPath(this.document, itemElement, './link');
        if (linkElement !== null) {
            link = parseURL(linkElement.textContent, this.url);
        }

        let pubDateElement = feedQueryXPath(this.document, itemElement,
                './pubDate');
        if (pubDateElement !== null) {
            date = parseDate(pubDateElement.textContent);
        }

        let descriptionElement = feedQueryXPath(this.document, itemElement,
                './description');
        if (descriptionElement !== null) {
            content = descriptionElement.textContent.trim();
        }

        for (let enclosureElement of
                feedQueryXPathAll(this.document, itemElement, './enclosure')) {
            try {
                let entryFile = this.parseRSS2EntryFile(enclosureElement);
                files.push(entryFile);
            } catch (e) {
                if (!(e instanceof TypeError)) {
                    throw e;
                }
            }
        }

        return new FeedEntry({title, link, date, content, files});
    }

    parseRSS2Feed() {
        let title;
        let subtitle;
        let logo;
        let entries = [];
        let documentElement = this.document.documentElement;
        let titleElement = feedQueryXPath(this.document, documentElement,
                './channel/title');
        if (titleElement !== null) {
            title = titleElement.textContent;
        }

        let descriptionElement = feedQueryXPath(this.document, documentElement,
                './channel/description');
        if (descriptionElement !== null) {
            subtitle = descriptionElement.textContent;
        }

        let imageElement = feedQueryXPath(this.document, documentElement,
                './channel/image');
        if (imageElement !== null) {
            try {
                logo = this.parseRSS2Logo(imageElement);
            } catch (e) {
                if (!(e instanceof TypeError)) {
                    throw e;
                }
            }
        }

        let itemElements = feedQueryXPathAll(this.document, documentElement,
                './channel/item');
        for (let itemElement of itemElements) {
            let entry = this.parseRSS2Entry(itemElement);
            if (typeof entry !== 'undefined') {
                entries.push(entry);
            }
        }

        return new Feed(this.url, {title, subtitle, logo, entries});
    }

    parseFromString(xmlString, url) {
        this.url = url;
        this.document = new DOMParser().parseFromString(xmlString,
                'application/xml');
        if (this.document.documentElement.nodeName.toLowerCase() ===
                'parsererror') {
            throw new ParserError(this.document.documentElement.textContent);
        }

        let [type, version] = this.constructor.probeFeed(this.document);
        if (type === 'atom') {
            return this.parseAtomFeed();
        } else if (type === 'rss') {
            if (version === '0.9') {
                return this.parseRSS1Feed();
            } else {
                return this.parseRSS2Feed();
            }
        }
        throw new UnsupportedFeedTypeError();
    }
}