comparison js/feed-parser.js @ 15:150f07c7595f

Add support for Atom 0.3 feeds
author Guido Berhoerster <guido+feed-preview@berhoerster.name>
date Mon, 10 Dec 2018 23:24:36 +0100
parents 376a0e415bba
children 48cabd01ef64
comparison
equal deleted inserted replaced
14:376a0e415bba 15:150f07c7595f
7 */ 7 */
8 8
9 'use strict'; 9 'use strict';
10 10
/**
 * XML namespace URIs the feed parser compares element namespaces against.
 */
export const XMLNS = {
    ATOM03: 'http://purl.org/atom/ns#',
    ATOM10: 'http://www.w3.org/2005/Atom',
    RSS09: 'http://my.netscape.com/rdf/simple/0.9/',
    XHTML: 'http://www.w3.org/1999/xhtml',
    // namespace of the root element that code in this file checks for after
    // DOMParser.parseFromString() to detect a failed XML parse
    PARSERERROR: 'http://www.mozilla.org/newlayout/xml/parsererror.xml'
};
// URL schemes accepted for links -- presumably consulted by parseURL();
// TODO confirm against its definition
const ALLOWED_LINK_PROTOCOLS = new Set(['http:', 'https:', 'ftp:']);
17 19
18 function encodeXML(str) { 20 function encodeXML(str) {
19 return str.replace(/[<>&'"]/g, c => { 21 return str.replace(/[<>&'"]/g, c => {
46 } 48 }
47 49
48 return url; 50 return url;
49 } 51 }
50 52
/**
 * Decode a base64 string and interpret the resulting bytes as text using
 * TextDecoder's default encoding (UTF-8).
 *
 * @param {string} base64Str - base64 encoded data
 * @returns {string} the decoded text
 * @throws {TypeError} if atob() rejects the input with a DOMException
 */
function base64Decode(base64Str) {
    let binaryStr;
    try {
        binaryStr = atob(base64Str);
    } catch (e) {
        // report malformed input as a TypeError, pass on anything else as is
        throw (e instanceof DOMException) ? new TypeError(e.message) : e;
    }
    // atob() yields a string with one character (code 0-255) per decoded
    // byte, so each char code maps directly to one byte
    const bytes = Uint8Array.from(binaryStr, c => c.charCodeAt(0));
    return new TextDecoder().decode(bytes);
}
66
51 function feedNSResolver(prefix) { 67 function feedNSResolver(prefix) {
52 switch (prefix) { 68 switch (prefix) {
69 case 'atom03':
70 return XMLNS.ATOM03;
53 case 'atom': 71 case 'atom':
54 return XMLNS.ATOM10; 72 return XMLNS.ATOM10;
55 case 'rss': 73 case 'rss':
56 return XMLNS.RSS09; 74 return XMLNS.RSS09;
57 } 75 }
175 193
176 export class FeedParser { 194 export class FeedParser {
177 static probeFeed(feedDocument) { 195 static probeFeed(feedDocument) {
178 let documentElement = feedDocument.documentElement; 196 let documentElement = feedDocument.documentElement;
179 if (documentElement.nodeName === 'feed' && 197 if (documentElement.nodeName === 'feed' &&
198 documentElement.namespaceURI === XMLNS.ATOM03) {
199 return ['atom', '0.3'];
200 } else if (documentElement.nodeName === 'feed' &&
180 documentElement.namespaceURI === XMLNS.ATOM10) { 201 documentElement.namespaceURI === XMLNS.ATOM10) {
181 let version = documentElement.getAttribute('version'); 202 return ['atom', '1.0'];
182 if (version === null) {
183 version = '1.0';
184 }
185 if (version === '1.0') {
186 return ['atom', version];
187 }
188 } else if (documentElement.nodeName === 'rss') { 203 } else if (documentElement.nodeName === 'rss') {
189 let version = documentElement.getAttribute('version'); 204 let version = documentElement.getAttribute('version');
190 switch (version) { 205 switch (version) {
191 case '0.90': 206 case '0.90':
192 case '0.91': 207 case '0.91':
207 constructor() { 222 constructor() {
208 this.url = undefined; 223 this.url = undefined;
209 this.document = undefined; 224 this.document = undefined;
210 } 225 }
211 226
227 parseAtom03ContentConstruct(containerElement, textOnly = true) {
228 let contentType = containerElement.getAttribute('type');
229 let contentMode = containerElement.getAttribute('mode');
230 if (contentType === null) {
231 contentType = 'text/plain';
232 }
233 if (contentMode === null) {
234 contentMode = 'xml';
235 }
236 if (contentType === 'application/xhtml+xml') {
237 let htmlText;
238 if (contentMode === 'xml') {
239 return textOnly ? containerElement.textContent.trim() :
240 containerElement.innerHTML;
241 } else if (contentMode === 'escaped') {
242 htmlText = containerElement.textContent;
243 } else if (contentMode === 'base64') {
244 htmlText = base64Decode(containerElement.textContent);
245 }
246 if (typeof htmlText === 'undefined') {
247 return;
248 }
249 if (textOnly) {
250 let htmlDocument = new DOMParser().parseFromString(htmlText,
251 'application/xhtml+xml');
252 if (htmlDocument.documentElement.namespaceURI ===
253 XMLNS.PARSERERROR) {
254 return;
255 }
256 return htmlDocument.body.textContent.trim();
257 }
258 return htmlText;
259 } else if (contentType === 'text/html') {
260 let htmlText;
261 if (contentMode === 'escaped') {
262 htmlText = containerElement.textContent;
263 } else if (contentMode === 'base64') {
264 htmlText = base64Decode(containerElement.textContent);
265 }
266 if (typeof htmlText === 'undefined') {
267 return;
268 }
269 if (textOnly) {
270 let htmlDocument = new DOMParser().parseFromString(htmlText,
271 'text/html');
272 return htmlDocument.body.textContent.trim();
273 }
274 return htmlText;
275 } else if (contentType === 'text/plain') {
276 let text;
277 if (contentMode === 'escaped') {
278 text = containerElement.textContent;
279 } else if (contentMode === 'base64') {
280 text = base64Decode(containerElement.textContent);
281 }
282 if (typeof text === 'undefined') {
283 return;
284 }
285 return textOnly ? text : `<pre>${encodeXML(text)}</pre>`;
286 }
287 return;
288 }
289
290 parseAtom03Content(contentElement) {
291 // ordered from lowest to highest preference
292 const contentTypes = [
293 'text/plain',
294 'text/html',
295 'application/xhtml+xml'
296 ];
297 if (contentElement.getAttribute('type') === 'multipart/alternative' &&
298 contentElement.getAttribute('mode') === null) {
299 // select alternative according to above preference
300 let selectedTypeIndex = -1;
301 let selectedElement;
302 for (let innerContentElement of contentElement.children) {
303 if (innerContentElement.localName !== 'content' ||
304 innerContentElement.namespaceURI !== XMLNS.ATOM03) {
305 throw new TypeError('child elements of a multipart ' +
306 ' content elements must be content elements');
307 }
308 let innerContentType = innerContentElement.getAttribute('type');
309 if (innerContentType === null) {
310 innerContentType = 'text/plain';
311 }
312 let typeIndex = contentTypes.indexOf(innerContentType);
313 if (typeIndex > selectedTypeIndex) {
314 selectedTypeIndex = typeIndex;
315 selectedElement = innerContentElement;
316 }
317 }
318 if (selectedTypeIndex >= 0) {
319 contentElement = selectedElement;
320 }
321 }
322
323 return this.parseAtom03ContentConstruct(contentElement, false);
324 }
325
326 parseAtom03Entry(entryElement) {
327 let title;
328 let link;
329 let date;
330 let content;
331 let titleElement = feedQueryXPath(this.document, entryElement,
332 './atom03:title');
333 if (titleElement !== null) {
334 title = titleElement.textContent.trim();
335 }
336
337 let linkElement = feedQueryXPath(this.document, entryElement,
338 './atom03:link[@href][@rel="alternate"]');
339 if (linkElement !== null) {
340 link = parseURL(linkElement.getAttribute('href'), this.url);
341 }
342
343 let modifiedElement = feedQueryXPath(this.document, entryElement,
344 './atom03:modified');
345 if (modifiedElement !== null) {
346 date = parseDate(modifiedElement.textContent);
347 }
348
349 let contentElement = feedQueryXPath(this.document, entryElement,
350 './atom03:content');
351 if (contentElement !== null) {
352 try {
353 content = this.parseAtom03Content(contentElement);
354 } catch (e) {
355 if (!(e instanceof TypeError)) {
356 throw e;
357 }
358 }
359 }
360 if (typeof content === 'undefined') {
361 let summaryElement = feedQueryXPath(this.document, entryElement,
362 './atom03:summary');
363 if (summaryElement !== null) {
364 try {
365 content = this.parseAtom03ContentConstruct(summaryElement,
366 false);
367 } catch (e) {
368 if (!(e instanceof TypeError)) {
369 throw e;
370 }
371 }
372 }
373 }
374
375 return new FeedEntry({title, link, date, content});
376 }
377
378 parseAtom03Feed() {
379 let title;
380 let subtitle;
381 let logo;
382 let entries = [];
383 let documentElement = this.document.documentElement;
384
385 let titleElement = feedQueryXPath(this.document, documentElement,
386 './atom03:title');
387 if (titleElement !== null) {
388 try {
389 title = this.parseAtom03ContentConstruct(titleElement);
390 } catch (e) {
391 if (!(e instanceof TypeError)) {
392 throw e;
393 }
394 }
395 }
396
397 let taglineElement = feedQueryXPath(this.document, documentElement,
398 './atom03:tagline');
399 if (taglineElement !== null) {
400 try {
401 title = this.parseAtom03ContentConstruct(taglineElement);
402 } catch (e) {
403 if (!(e instanceof TypeError)) {
404 throw e;
405 }
406 }
407 }
408
409 let logoElement = feedQueryXPath(this.document, documentElement,
410 './atom03:logo');
411 if (logoElement !== null) {
412 try {
413 logo = this.parseAtomLogo(logoElement);
414 } catch (e) {
415 if (!(e instanceof TypeError)) {
416 throw e;
417 }
418 }
419 }
420
421 let entryElements = feedQueryXPathAll(this.document, documentElement,
422 './atom03:entry');
423 for (let entryElement of entryElements) {
424 entries.push(this.parseAtom03Entry(entryElement));
425 }
426
427 return new Feed(this.url, {title, subtitle, logo, entries});
428 }
429
212 parseAtomLogo(logoElement) { 430 parseAtomLogo(logoElement) {
213 let url = parseURL(logoElement.textContent.trim(), this.url); 431 let url = parseURL(logoElement.textContent.trim(), this.url);
214 if (url === null) { 432 if (url === null) {
215 throw new TypeError('invalid URL in <logo> element'); 433 throw new TypeError('invalid URL in <logo> element');
216 } 434 }
547 765
548 parseFromString(xmlString, url) { 766 parseFromString(xmlString, url) {
549 this.url = url; 767 this.url = url;
550 this.document = new DOMParser().parseFromString(xmlString, 768 this.document = new DOMParser().parseFromString(xmlString,
551 'application/xml'); 769 'application/xml');
552 if (this.document.documentElement.nodeName.toLowerCase() === 770 if (this.document.documentElement.namespaceURI === XMLNS.PARSERERROR) {
553 'parsererror') {
554 throw new ParserError(this.document.documentElement.textContent); 771 throw new ParserError(this.document.documentElement.textContent);
555 } 772 }
556 773
557 let [type, version] = this.constructor.probeFeed(this.document); 774 let [type, version] = this.constructor.probeFeed(this.document);
558 if (type === 'atom') { 775 if (type === 'atom') {
559 return this.parseAtomFeed(); 776 if (version === '0.3') {
777 return this.parseAtom03Feed();
778 } else if (version === '1.0') {
779 return this.parseAtomFeed();
780 }
560 } else if (type === 'rss') { 781 } else if (type === 'rss') {
561 if (version === '0.9') { 782 if (version === '0.9') {
562 return this.parseRSS1Feed(); 783 return this.parseRSS1Feed();
563 } else { 784 } else {
564 return this.parseRSS2Feed(); 785 return this.parseRSS2Feed();