comparison js/feed-parser.js @ 14:376a0e415bba

Properly handle non-text content in Atom feed elements. The title, subtitle, summary, and content elements of Atom feeds can all have non-text content. When parsing title and subtitle elements, HTML and XHTML content will be stripped of any markup in order to keep them simple. In summary and content elements, markup will be preserved. Element content of any other type, as well as remote content in content elements, will be ignored.
author Guido Berhoerster <guido+feed-preview@berhoerster.name>
date Mon, 10 Dec 2018 16:38:11 +0100
parents 2bbb7617dd13
children 150f07c7595f
comparison
equal deleted inserted replaced
13:799d633ccd4d 14:376a0e415bba
8 8
9 'use strict'; 9 'use strict';
10 10
11 export const XMLNS = { 11 export const XMLNS = {
12 ATOM10: 'http://www.w3.org/2005/Atom', 12 ATOM10: 'http://www.w3.org/2005/Atom',
13 RSS09: 'http://my.netscape.com/rdf/simple/0.9/' 13 RSS09: 'http://my.netscape.com/rdf/simple/0.9/',
14 XHTML: 'http://www.w3.org/1999/xhtml'
14 } 15 }
15 const ALLOWED_LINK_PROTOCOLS = new Set(['http:', 'https:', 'ftp:']); 16 const ALLOWED_LINK_PROTOCOLS = new Set(['http:', 'https:', 'ftp:']);
16 17
17 function encodeXML(str) { 18 function encodeXML(str) {
18 return str.replace(/[<>&'"]/g, c => { 19 return str.replace(/[<>&'"]/g, c => {
214 throw new TypeError('invalid URL in <logo> element'); 215 throw new TypeError('invalid URL in <logo> element');
215 } 216 }
216 return new FeedLogo(url); 217 return new FeedLogo(url);
217 } 218 }
218 219
220 parseAtomTextConstruct(containerElement, textOnly = true) {
221 let contentType = containerElement.getAttribute('type');
222 if (contentType === null) {
223 contentType = 'text';
224 }
225
226 if (contentType === 'xhtml') {
227 let xhtmlRootElement = containerElement.firstElementChild;
228 if (xhtmlRootElement !== null &&
229 xhtmlRootElement.localName === 'div' &&
230 xhtmlRootElement.namespaceURI === XMLNS.XHTML) {
231 return textOnly ? xhtmlRootElement.textContent.trim() :
232 xhtmlRootElement.innerHTML;
233 }
234 } else if (contentType === 'html') {
235 let htmlText = containerElement.textContent;
236 if (textOnly) {
237 let htmlDocument = new DOMParser().parseFromString(htmlText,
238 'text/html');
239 return htmlDocument.body.textContent.trim();
240 }
241 return htmlText
242 } else if (contentType === 'text') {
243 let text = containerElement.textContent.trim();
244 return textOnly ? text : `<pre>${encodeXML(text)}</pre>`;
245 }
246
247 // unsupported content type
248 return;
249 }
250
251 parseAtomContent(contentElement) {
252 let contentSrc = contentElement.getAttribute('src');
253 if (contentSrc !== null) {
254 // externally referenced content is not supported
255 return;
256 }
257 return this.parseAtomTextConstruct(contentElement, false);
258 }
259
219 parseAtomEntry(entryElement) { 260 parseAtomEntry(entryElement) {
220 let title; 261 let title;
221 let link; 262 let link;
222 let date; 263 let date;
223 let content; 264 let content;
224 let titleElement = feedQueryXPath(this.document, entryElement, 265 let titleElement = feedQueryXPath(this.document, entryElement,
225 './atom:title'); 266 './atom:title');
226 if (titleElement !== null) { 267 if (titleElement !== null) {
227 title = titleElement.textContent.trim(); 268 title = this.parseAtomTextConstruct(titleElement);
228 } 269 }
229 270
230 let linkElement = feedQueryXPath(this.document, entryElement, 271 let linkElement = feedQueryXPath(this.document, entryElement,
231 './atom:link[@href][not(@rel) or @rel="alternate"]'); 272 './atom:link[@href][not(@rel) or @rel="alternate"]');
232 if (linkElement !== null) { 273 if (linkElement !== null) {
239 date = parseDate(updatedElement.textContent); 280 date = parseDate(updatedElement.textContent);
240 } 281 }
241 282
242 let contentElement = feedQueryXPath(this.document, entryElement, 283 let contentElement = feedQueryXPath(this.document, entryElement,
243 './atom:content'); 284 './atom:content');
244 if (contentElement === null) { 285 if (contentElement !== null) {
245 contentElement = feedQueryXPath(this.document, entryElement, 286 content = this.parseAtomContent(contentElement);
287 }
288 if (typeof content === 'undefined') {
289 let summaryElement = feedQueryXPath(this.document, entryElement,
246 './atom:summary'); 290 './atom:summary');
247 } 291 if (summaryElement !== null) {
248 if (contentElement !== null) { 292 content = this.parseAtomTextConstruct(summaryElement, false);
249 let contentType = contentElement.getAttribute('type');
250 if (contentType === null) {
251 contentType = 'text';
252 }
253 contentType = contentType.toLowerCase();
254 if (contentType === 'xhtml') {
255 content = contentElement.innerHTML;
256 } else if (contentType === 'html') {
257 content = contentElement.textContent;
258 } else {
259 let encodedContent =
260 encodeXML(contentElement.textContent.trim());
261 content = `<pre>${encodedContent}</pre>`;
262 } 293 }
263 } 294 }
264 295
265 return new FeedEntry({title, link, date, content}); 296 return new FeedEntry({title, link, date, content});
266 } 297 }
273 let documentElement = this.document.documentElement; 304 let documentElement = this.document.documentElement;
274 305
275 let titleElement = feedQueryXPath(this.document, documentElement, 306 let titleElement = feedQueryXPath(this.document, documentElement,
276 './atom:title'); 307 './atom:title');
277 if (titleElement !== null) { 308 if (titleElement !== null) {
278 title = titleElement.textContent.trim(); 309 title = this.parseAtomTextConstruct(titleElement);
279 } 310 }
280 311
281 let subtitleElement = feedQueryXPath(this.document, documentElement, 312 let subtitleElement = feedQueryXPath(this.document, documentElement,
282 './atom:subtitle'); 313 './atom:subtitle');
283 if (subtitleElement !== null) { 314 if (subtitleElement !== null) {
284 subtitle = subtitleElement.textContent.trim(); 315 subtitle = this.parseAtomTextConstruct(subtitleElement);
285 } 316 }
286 317
287 let logoElement = feedQueryXPath(this.document, documentElement, 318 let logoElement = feedQueryXPath(this.document, documentElement,
288 './atom:logo'); 319 './atom:logo');
289 if (logoElement !== null) { 320 if (logoElement !== null) {