addons/firefox-addons/feed-preview

changeset 14:376a0e415bba

Properly handle non-text content in Atom feed elements

The title, subtitle, summary and content elements of Atom feeds can all have
non-text content. When parsing title and subtitle elements, HTML and XHTML
content is stripped of any markup so that these fields remain plain text. In
summary and content elements, markup is preserved. Element content of any
other type, as well as remote content in content elements (referenced through
a src attribute), is ignored.
author Guido Berhoerster <guido+feed-preview@berhoerster.name>
date Mon Dec 10 16:38:11 2018 +0100 (18 months ago)
parents 799d633ccd4d
children 150f07c7595f
files js/background.js js/feed-parser.js
line diff
     1.1 --- a/js/background.js	Sat Dec 08 12:12:18 2018 +0100
     1.2 +++ b/js/background.js	Mon Dec 10 16:38:11 2018 +0100
     1.3 @@ -48,7 +48,10 @@
     1.4  const FEED_MAGIC = [
     1.5      '<rss',
     1.6      '<feed',
     1.7 -    ...Object.values(feedParser.XMLNS)
     1.8 +    feedParser.XMLNS.ATOM03,
     1.9 +    feedParser.XMLNS.ATOM10,
    1.10 +    feedParser.XMLNS.RSS09,
    1.11 +    feedParser.XMLNS.RSS10
    1.12  ];
    1.13  var tabsFeeds = new Map();
    1.14  var tabsFeedPreviews = new Map();
     2.1 --- a/js/feed-parser.js	Sat Dec 08 12:12:18 2018 +0100
     2.2 +++ b/js/feed-parser.js	Mon Dec 10 16:38:11 2018 +0100
     2.3 @@ -10,7 +10,8 @@
     2.4  
     2.5  export const XMLNS = {
     2.6      ATOM10: 'http://www.w3.org/2005/Atom',
     2.7 -    RSS09: 'http://my.netscape.com/rdf/simple/0.9/'
     2.8 +    RSS09: 'http://my.netscape.com/rdf/simple/0.9/',
     2.9 +    XHTML: 'http://www.w3.org/1999/xhtml'
    2.10  }
    2.11  const ALLOWED_LINK_PROTOCOLS = new Set(['http:', 'https:', 'ftp:']);
    2.12  
    2.13 @@ -216,6 +217,46 @@
    2.14          return new FeedLogo(url);
    2.15      }
    2.16  
    2.17 +    parseAtomTextConstruct(containerElement, textOnly = true) {
    2.18 +        let contentType = containerElement.getAttribute('type');
    2.19 +        if (contentType === null) {
    2.20 +            contentType = 'text';
    2.21 +        }
    2.22 +
    2.23 +        if (contentType === 'xhtml') {
    2.24 +            let xhtmlRootElement = containerElement.firstElementChild;
    2.25 +            if (xhtmlRootElement !== null &&
    2.26 +                    xhtmlRootElement.localName === 'div' &&
    2.27 +                    xhtmlRootElement.namespaceURI === XMLNS.XHTML) {
    2.28 +                return textOnly ? xhtmlRootElement.textContent.trim() :
    2.29 +                        xhtmlRootElement.innerHTML;
    2.30 +            }
    2.31 +        } else if (contentType === 'html') {
    2.32 +            let htmlText = containerElement.textContent;
    2.33 +            if (textOnly) {
    2.34 +                let htmlDocument = new DOMParser().parseFromString(htmlText,
    2.35 +                        'text/html');
    2.36 +                return htmlDocument.body.textContent.trim();
    2.37 +            }
    2.38 +            return htmlText
    2.39 +        } else if (contentType === 'text') {
    2.40 +            let text = containerElement.textContent.trim();
    2.41 +            return textOnly ? text : `<pre>${encodeXML(text)}</pre>`;
    2.42 +        }
    2.43 +
    2.44 +        // unsupported content type
    2.45 +        return;
    2.46 +    }
    2.47 +
    2.48 +    parseAtomContent(contentElement) {
    2.49 +        let contentSrc = contentElement.getAttribute('src');
    2.50 +        if (contentSrc !== null) {
    2.51 +            // externally referenced content is not supported
    2.52 +            return;
    2.53 +        }
    2.54 +        return this.parseAtomTextConstruct(contentElement, false);
    2.55 +    }
    2.56 +
    2.57      parseAtomEntry(entryElement) {
    2.58          let title;
    2.59          let link;
    2.60 @@ -224,7 +265,7 @@
    2.61          let titleElement = feedQueryXPath(this.document, entryElement,
    2.62                  './atom:title');
    2.63          if (titleElement !== null) {
    2.64 -            title = titleElement.textContent.trim();
    2.65 +            title = this.parseAtomTextConstruct(titleElement);
    2.66          }
    2.67  
    2.68          let linkElement = feedQueryXPath(this.document, entryElement,
    2.69 @@ -241,24 +282,14 @@
    2.70  
    2.71          let contentElement = feedQueryXPath(this.document, entryElement,
    2.72                  './atom:content');
    2.73 -        if (contentElement === null) {
    2.74 -            contentElement = feedQueryXPath(this.document, entryElement,
    2.75 +        if (contentElement !== null) {
    2.76 +            content = this.parseAtomContent(contentElement);
    2.77 +        }
    2.78 +        if (typeof content === 'undefined') {
    2.79 +            let summaryElement = feedQueryXPath(this.document, entryElement,
    2.80                      './atom:summary');
    2.81 -        }
    2.82 -        if (contentElement !== null) {
    2.83 -            let contentType = contentElement.getAttribute('type');
    2.84 -            if (contentType === null) {
    2.85 -                contentType = 'text';
    2.86 -            }
    2.87 -            contentType = contentType.toLowerCase();
    2.88 -            if (contentType === 'xhtml') {
    2.89 -                content = contentElement.innerHTML;
    2.90 -            } else if (contentType === 'html') {
    2.91 -                content = contentElement.textContent;
    2.92 -            } else {
    2.93 -                let encodedContent =
    2.94 -                        encodeXML(contentElement.textContent.trim());
    2.95 -                content = `<pre>${encodedContent}</pre>`;
    2.96 +            if (summaryElement !== null) {
    2.97 +                content = this.parseAtomTextConstruct(summaryElement, false);
    2.98              }
    2.99          }
   2.100  
   2.101 @@ -275,13 +306,13 @@
   2.102          let titleElement = feedQueryXPath(this.document, documentElement,
   2.103                  './atom:title');
   2.104          if (titleElement !== null) {
   2.105 -            title = titleElement.textContent.trim();
   2.106 +            title = this.parseAtomTextConstruct(titleElement);
   2.107          }
   2.108  
   2.109          let subtitleElement = feedQueryXPath(this.document, documentElement,
   2.110                  './atom:subtitle');
   2.111          if (subtitleElement !== null) {
   2.112 -            subtitle = subtitleElement.textContent.trim();
   2.113 +            subtitle = this.parseAtomTextConstruct(subtitleElement);
   2.114          }
   2.115  
   2.116          let logoElement = feedQueryXPath(this.document, documentElement,