Mercurial > addons > firefox-addons > feed-preview
comparison js/feed-parser.js @ 14:376a0e415bba
Properly handle non-text content in Atom feed elements
The title, subtitle, summary and content elements of Atom feeds can all have
non-text content. When parsing title and subtitle elements, HTML and XHTML
content will be stripped of any markup in order to keep it simple. In summary
and content elements, markup will be preserved. Element content of any other
type, as well as remote content referenced via a src attribute in content
elements, will be ignored.
author | Guido Berhoerster <guido+feed-preview@berhoerster.name> |
---|---|
date | Mon, 10 Dec 2018 16:38:11 +0100 |
parents | 2bbb7617dd13 |
children | 150f07c7595f |
comparison
equal
deleted
inserted
replaced
13:799d633ccd4d | 14:376a0e415bba |
---|---|
8 | 8 |
9 'use strict'; | 9 'use strict'; |
10 | 10 |
11 export const XMLNS = { | 11 export const XMLNS = { |
12 ATOM10: 'http://www.w3.org/2005/Atom', | 12 ATOM10: 'http://www.w3.org/2005/Atom', |
13 RSS09: 'http://my.netscape.com/rdf/simple/0.9/' | 13 RSS09: 'http://my.netscape.com/rdf/simple/0.9/', |
14 XHTML: 'http://www.w3.org/1999/xhtml' | |
14 } | 15 } |
15 const ALLOWED_LINK_PROTOCOLS = new Set(['http:', 'https:', 'ftp:']); | 16 const ALLOWED_LINK_PROTOCOLS = new Set(['http:', 'https:', 'ftp:']); |
16 | 17 |
17 function encodeXML(str) { | 18 function encodeXML(str) { |
18 return str.replace(/[<>&'"]/g, c => { | 19 return str.replace(/[<>&'"]/g, c => { |
214 throw new TypeError('invalid URL in <logo> element'); | 215 throw new TypeError('invalid URL in <logo> element'); |
215 } | 216 } |
216 return new FeedLogo(url); | 217 return new FeedLogo(url); |
217 } | 218 } |
218 | 219 |
220 parseAtomTextConstruct(containerElement, textOnly = true) { | |
221 let contentType = containerElement.getAttribute('type'); | |
222 if (contentType === null) { | |
223 contentType = 'text'; | |
224 } | |
225 | |
226 if (contentType === 'xhtml') { | |
227 let xhtmlRootElement = containerElement.firstElementChild; | |
228 if (xhtmlRootElement !== null && | |
229 xhtmlRootElement.localName === 'div' && | |
230 xhtmlRootElement.namespaceURI === XMLNS.XHTML) { | |
231 return textOnly ? xhtmlRootElement.textContent.trim() : | |
232 xhtmlRootElement.innerHTML; | |
233 } | |
234 } else if (contentType === 'html') { | |
235 let htmlText = containerElement.textContent; | |
236 if (textOnly) { | |
237 let htmlDocument = new DOMParser().parseFromString(htmlText, | |
238 'text/html'); | |
239 return htmlDocument.body.textContent.trim(); | |
240 } | |
241 return htmlText | |
242 } else if (contentType === 'text') { | |
243 let text = containerElement.textContent.trim(); | |
244 return textOnly ? text : `<pre>${encodeXML(text)}</pre>`; | |
245 } | |
246 | |
247 // unsupported content type | |
248 return; | |
249 } | |
250 | |
251 parseAtomContent(contentElement) { | |
252 let contentSrc = contentElement.getAttribute('src'); | |
253 if (contentSrc !== null) { | |
254 // externally referenced content is not supported | |
255 return; | |
256 } | |
257 return this.parseAtomTextConstruct(contentElement, false); | |
258 } | |
259 | |
219 parseAtomEntry(entryElement) { | 260 parseAtomEntry(entryElement) { |
220 let title; | 261 let title; |
221 let link; | 262 let link; |
222 let date; | 263 let date; |
223 let content; | 264 let content; |
224 let titleElement = feedQueryXPath(this.document, entryElement, | 265 let titleElement = feedQueryXPath(this.document, entryElement, |
225 './atom:title'); | 266 './atom:title'); |
226 if (titleElement !== null) { | 267 if (titleElement !== null) { |
227 title = titleElement.textContent.trim(); | 268 title = this.parseAtomTextConstruct(titleElement); |
228 } | 269 } |
229 | 270 |
230 let linkElement = feedQueryXPath(this.document, entryElement, | 271 let linkElement = feedQueryXPath(this.document, entryElement, |
231 './atom:link[@href][not(@rel) or @rel="alternate"]'); | 272 './atom:link[@href][not(@rel) or @rel="alternate"]'); |
232 if (linkElement !== null) { | 273 if (linkElement !== null) { |
239 date = parseDate(updatedElement.textContent); | 280 date = parseDate(updatedElement.textContent); |
240 } | 281 } |
241 | 282 |
242 let contentElement = feedQueryXPath(this.document, entryElement, | 283 let contentElement = feedQueryXPath(this.document, entryElement, |
243 './atom:content'); | 284 './atom:content'); |
244 if (contentElement === null) { | 285 if (contentElement !== null) { |
245 contentElement = feedQueryXPath(this.document, entryElement, | 286 content = this.parseAtomContent(contentElement); |
287 } | |
288 if (typeof content === 'undefined') { | |
289 let summaryElement = feedQueryXPath(this.document, entryElement, | |
246 './atom:summary'); | 290 './atom:summary'); |
247 } | 291 if (summaryElement !== null) { |
248 if (contentElement !== null) { | 292 content = this.parseAtomTextConstruct(summaryElement, false); |
249 let contentType = contentElement.getAttribute('type'); | |
250 if (contentType === null) { | |
251 contentType = 'text'; | |
252 } | |
253 contentType = contentType.toLowerCase(); | |
254 if (contentType === 'xhtml') { | |
255 content = contentElement.innerHTML; | |
256 } else if (contentType === 'html') { | |
257 content = contentElement.textContent; | |
258 } else { | |
259 let encodedContent = | |
260 encodeXML(contentElement.textContent.trim()); | |
261 content = `<pre>${encodedContent}</pre>`; | |
262 } | 293 } |
263 } | 294 } |
264 | 295 |
265 return new FeedEntry({title, link, date, content}); | 296 return new FeedEntry({title, link, date, content}); |
266 } | 297 } |
273 let documentElement = this.document.documentElement; | 304 let documentElement = this.document.documentElement; |
274 | 305 |
275 let titleElement = feedQueryXPath(this.document, documentElement, | 306 let titleElement = feedQueryXPath(this.document, documentElement, |
276 './atom:title'); | 307 './atom:title'); |
277 if (titleElement !== null) { | 308 if (titleElement !== null) { |
278 title = titleElement.textContent.trim(); | 309 title = this.parseAtomTextConstruct(titleElement); |
279 } | 310 } |
280 | 311 |
281 let subtitleElement = feedQueryXPath(this.document, documentElement, | 312 let subtitleElement = feedQueryXPath(this.document, documentElement, |
282 './atom:subtitle'); | 313 './atom:subtitle'); |
283 if (subtitleElement !== null) { | 314 if (subtitleElement !== null) { |
284 subtitle = subtitleElement.textContent.trim(); | 315 subtitle = this.parseAtomTextConstruct(subtitleElement); |
285 } | 316 } |
286 | 317 |
287 let logoElement = feedQueryXPath(this.document, documentElement, | 318 let logoElement = feedQueryXPath(this.document, documentElement, |
288 './atom:logo'); | 319 './atom:logo'); |
289 if (logoElement !== null) { | 320 if (logoElement !== null) { |