comparison js/feed-parser.js @ 6:5d7c13e998e9

Create feed previews using a stream filter. Instead of replacing the feed document with an XHTML preview from a content script after it has already been rendered, create an XHTML preview using a stream filter before it is passed into the rendering engine and use an XSL style sheet to convert it to HTML. This has two advantages: firstly, it results in an HTMLDocument with the full HTML DOM available, and secondly, it avoids rendering the document twice. Refactor the feed preview creation and split parsing and rendering into separate modules.
author Guido Berhoerster <guido+feed-preview@berhoerster.name>
date Thu, 08 Nov 2018 16:30:34 +0100
parents
children 2bbb7617dd13
comparison
equal deleted inserted replaced
5:341a0f4b7ce0 6:5d7c13e998e9
1 /*
2 * Copyright (C) 2018 Guido Berhoerster <guido+feed-preview@berhoerster.name>
3 *
4 * This Source Code Form is subject to the terms of the Mozilla Public
5 * License, v. 2.0. If a copy of the MPL was not distributed with this
6 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 */
8
9 'use strict';
10
/**
 * XML namespace URIs of the namespaced feed formats handled by this module.
 * Frozen so that importers cannot alter the table shared by every parser.
 */
export const XMLNS = Object.freeze({
    ATOM10: 'http://www.w3.org/2005/Atom',
    RSS09: 'http://my.netscape.com/rdf/simple/0.9/',
});

/** URL schemes (in `URL.protocol` form) that parseURL() accepts for links. */
const ALLOWED_LINK_PROTOCOLS = new Set(['http:', 'https:', 'ftp:']);
16
/**
 * Escape the five XML special characters of a string with their predefined
 * entity references.
 *
 * @param {string} str - text to escape
 * @returns {string} the text with <, >, &, ' and " replaced by entities
 */
function encodeXML(str) {
    const entityByCharacter = new Map([
        ['<', '&lt;'],
        ['>', '&gt;'],
        ['&', '&amp;'],
        ['\'', '&apos;'],
        ['"', '&quot;'],
    ]);

    return str.replace(/[<>&'"]/g,
        (character) => entityByCharacter.get(character));
}
28
/**
 * Convert a date string into a Date object.
 *
 * @param {string} s - date string handed to the Date constructor
 * @returns {Date} the parsed date, or the epoch (new Date(0)) when the
 *     string cannot be parsed
 */
function parseDate(s) {
    const parsed = new Date(s);
    if (Number.isNaN(parsed.getTime())) {
        // unparsable input falls back to the epoch instead of "Invalid Date"
        return new Date(0);
    }

    return parsed;
}
34
/**
 * Parse a link found in a feed and restrict it to the allowed protocols.
 *
 * @param {?string} text - URL text, absolute or relative to baseURL
 * @param {string|URL} [baseURL=''] - base for resolving relative URLs; an
 *     empty string or null means "no base"
 * @returns {?URL} the parsed URL, or null if text is missing, cannot be
 *     parsed, or uses a protocol not in ALLOWED_LINK_PROTOCOLS
 */
function parseURL(text, baseURL = '') {
    // a missing attribute arrives as null; the URL constructor would
    // stringify it and resolve the relative path "null" against the base
    if (text === null || typeof text === 'undefined') {
        return null;
    }

    // the URL constructor throws on an unparsable base even when text is
    // absolute, so an empty base must not be passed on at all
    const hasBase = baseURL !== null && baseURL !== '';
    let url;
    try {
        url = hasBase ? new URL(text, baseURL) : new URL(text);
    } catch (e) {
        return null;
    }

    return ALLOWED_LINK_PROTOCOLS.has(url.protocol) ? url : null;
}
49
/**
 * Namespace resolver for the XPath queries of this module.
 *
 * @param {string} prefix - namespace prefix used in an XPath expression
 * @returns {?string} the Atom 1.0 namespace for "atom", the RSS 0.9
 *     namespace for "rss", null for any other prefix
 */
function feedNSResolver(prefix) {
    if (prefix === 'atom') {
        return XMLNS.ATOM10;
    }
    if (prefix === 'rss') {
        return XMLNS.RSS09;
    }

    return null;
}
59
/**
 * Evaluate an XPath expression and return the first matching node.
 *
 * @param {Document} feedDocument - document used to evaluate the expression
 * @param {Node} scopeElement - context node of the expression
 * @param {string} xpathQuery - XPath expression; the prefixes known to
 *     feedNSResolver may be used
 * @returns {?Node} first matching node in document order, null if none
 */
function feedQueryXPath(feedDocument, scopeElement, xpathQuery) {
    const firstMatch = feedDocument.evaluate(
        xpathQuery,
        scopeElement,
        feedNSResolver,
        XPathResult.FIRST_ORDERED_NODE_TYPE,
        null
    );

    return firstMatch.singleNodeValue;
}
64
/**
 * Evaluate an XPath expression and collect every matching node.
 *
 * @param {Document} feedDocument - document used to evaluate the expression
 * @param {Node} scopeElement - context node of the expression
 * @param {string} xpathQuery - XPath expression; the prefixes known to
 *     feedNSResolver may be used
 * @returns {Node[]} matching nodes in document order, empty if none
 */
function feedQueryXPathAll(feedDocument, scopeElement, xpathQuery) {
    const iterator = feedDocument.evaluate(
        xpathQuery,
        scopeElement,
        feedNSResolver,
        XPathResult.ORDERED_NODE_ITERATOR_TYPE,
        null
    );

    // drain the iterator; iterateNext() signals exhaustion with null
    const matches = [];
    let current = iterator.iterateNext();
    while (current !== null) {
        matches.push(current);
        current = iterator.iterateNext();
    }

    return matches;
}
76
/**
 * Error thrown when a feed document cannot be parsed as XML.
 * Accepts the same arguments as the Error constructor.
 */
export class ParserError extends Error {
    constructor(...errorArgs) {
        super(...errorArgs);
        // report the concrete class name rather than the generic "Error"
        this.name = this.constructor.name;
    }
}
83
/**
 * Error thrown when a document is well-formed XML but not one of the
 * supported feed formats.
 */
export class UnsupportedFeedTypeError extends Error {
    /**
     * @param {string} [message] - error message, defaults to a generic text
     * @param {...*} errorArgs - further arguments for the Error constructor
     */
    constructor(message = 'Document is not a supported feed', ...errorArgs) {
        super(message, ...errorArgs);
        // report the concrete class name rather than the generic "Error"
        this.name = this.constructor.name;
    }
}
90
/**
 * Error describing a failed transfer of a feed.
 * The URL, status code and status text are kept as properties.
 */
export class ProtocolError extends Error {
    /**
     * @param {string|URL} url - URL whose transfer failed
     * @param {number} status - status code of the failed transfer
     * @param {string} statusText - status text of the failed transfer
     * @param {...*} errorArgs - further arguments for the Error constructor
     */
    constructor(url, status, statusText, ...errorArgs) {
        super(
            `Protocol error: Transfer of ${url} failed with: ${status} ${statusText}`,
            ...errorArgs
        );
        Object.assign(this, {
            name: this.constructor.name,
            url,
            status,
            statusText,
        });
    }
}
102
/** Logo image of a feed. */
class FeedLogo {
    /**
     * @param {URL} url - location of the logo image
     * @param {Object} [options]
     * @param {string} [options.title=''] - title of the logo
     */
    constructor(url, options = {}) {
        const {title = ''} = options;

        this.url = url;
        this.title = title;
    }
}
109
/**
 * File attached to a feed entry.
 * Assigning the url property also derives the filename property from the
 * last segment of the URL path.
 */
class FeedEntryFile {
    /**
     * @param {URL} url - location of the file
     * @param {Object} [options]
     * @param {string} [options.type] - media type; defaults to the localized
     *     "defaultFileType" message
     * @param {number} [options.size=0] - size of the file
     */
    constructor(url, options = {}) {
        const {
            type = browser.i18n.getMessage('defaultFileType'),
            size = 0,
        } = options;

        this.filename = undefined;
        this._url = undefined;
        this.url = url;
        this.type = type;
        this.size = size;
    }

    get url() {
        return this._url;
    }

    set url(url) {
        this._url = url;

        const pathSegments = url.pathname.split('/');
        const lastSegment = pathSegments[pathSegments.length - 1];
        // a path ending in "/" has no file name; use the localized default
        if (lastSegment === '') {
            this.filename = browser.i18n.getMessage('defaultFileName');
        } else {
            this.filename = lastSegment;
        }
    }
}
131
/**
 * Single entry of a feed.
 * Content assigned to the content property is stored in normalized form,
 * see normalizeContent().
 */
class FeedEntry {
    /**
     * @param {Object} [options]
     * @param {string} [options.title] - entry title; defaults to the
     *     localized "defaultFeedEntryTitle" message
     * @param {URL} [options.link] - link of the entry
     * @param {Date} [options.date] - entry date, defaults to the epoch
     * @param {string} [options.content=''] - HTML content of the entry
     * @param {Array} [options.files=[]] - files attached to the entry
     */
    constructor(options = {}) {
        const {
            title = browser.i18n.getMessage('defaultFeedEntryTitle'),
            link = undefined,
            date = new Date(0),
            content = '',
            files = [],
        } = options;

        this.title = title;
        this.link = link;
        this.date = date;
        this._content = undefined;
        this.content = content;
        this.files = files;
    }

    /**
     * Parse HTML text as a complete HTML document and serialize that
     * document with XMLSerializer.
     *
     * @param {string|undefined} text - HTML markup
     * @returns {string|undefined} the serialized document, or undefined if
     *     text is undefined
     */
    normalizeContent(text) {
        if (text === undefined) {
            return undefined;
        }

        const parsedDocument = new DOMParser().parseFromString(text,
            'text/html');
        const contentDocument = document.implementation.createHTMLDocument();
        // move the parsed body into the new document before serializing it
        contentDocument.body = contentDocument.adoptNode(parsedDocument.body);

        return new XMLSerializer().serializeToString(contentDocument);
    }

    get content() {
        return this._content;
    }

    set content(content) {
        this._content = this.normalizeContent(content);
    }
}
163
/** Parsed feed with its metadata and entries. */
class Feed {
    /**
     * @param {string|URL} url - location of the feed
     * @param {Object} [options]
     * @param {string} [options.title] - feed title; defaults to the
     *     localized "defaultFeedTitle" message
     * @param {string} [options.subtitle=''] - subtitle or description
     * @param {Object} [options.logo] - logo of the feed, if any
     * @param {Array} [options.entries=[]] - entries of the feed
     */
    constructor(url, options = {}) {
        const {
            title = browser.i18n.getMessage('defaultFeedTitle'),
            subtitle = '',
            logo,
            entries = [],
        } = options;

        this.url = url;
        this.title = title;
        this.subtitle = subtitle;
        this.logo = logo;
        this.entries = entries;
    }
}
174
/**
 * Parser converting Atom 1.0, RSS 0.9 and RSS 0.9x/2.0 documents into Feed
 * objects. The entry point is parseFromString(); the remaining instance
 * methods operate on the document and URL stored by it.
 */
export class FeedParser {
    /**
     * Determine the feed format of an XML document from its root element.
     *
     * @param {Document} feedDocument - parsed XML document
     * @returns {Array} [type, version] where type is "atom" or "rss";
     *     [undefined, undefined] if the document is not a supported feed
     */
    static probeFeed(feedDocument) {
        const documentElement = feedDocument.documentElement;
        if (documentElement.nodeName === 'feed' &&
                documentElement.namespaceURI === XMLNS.ATOM10) {
            let version = documentElement.getAttribute('version');
            if (version === null) {
                // a feed without version attribute is treated as Atom 1.0
                version = '1.0';
            }
            if (version === '1.0') {
                return ['atom', version];
            }
        } else if (documentElement.nodeName === 'rss') {
            const version = documentElement.getAttribute('version');
            switch (version) {
                case '0.90':
                case '0.91':
                case '0.92':
                case '0.93':
                case '0.94':
                case '2.0':
                    return ['rss', version];
            }
        } else if (documentElement.localName.toLowerCase() === 'rdf' &&
                documentElement.getAttribute('xmlns') === XMLNS.RSS09) {
            return ['rss', '0.9'];
        }

        return [undefined, undefined];
    }

    constructor() {
        // both are set by parseFromString()
        this.url = undefined;
        this.document = undefined;
    }

    /**
     * Build the feed logo from an Atom <logo> element.
     *
     * @param {Element} logoElement - the <logo> element
     * @returns {FeedLogo}
     * @throws {TypeError} if the element does not contain a usable URL
     */
    parseAtomLogo(logoElement) {
        const url = parseURL(logoElement.textContent.trim(), this.url);
        if (url === null) {
            throw new TypeError('invalid URL in <logo> element');
        }

        return new FeedLogo(url);
    }

    /**
     * Build a feed entry from an Atom <entry> element.
     *
     * @param {Element} entryElement - the <entry> element
     * @returns {FeedEntry}
     */
    parseAtomEntry(entryElement) {
        let title;
        let link;
        let date;
        let content;

        const titleElement = feedQueryXPath(this.document, entryElement,
            './atom:title');
        if (titleElement !== null) {
            title = titleElement.textContent.trim();
        }

        // a link without rel attribute has to be interpreted as
        // rel="alternate", so both forms are accepted
        const linkElement = feedQueryXPath(this.document, entryElement,
            './atom:link[@href][not(@rel) or @rel="alternate"]');
        if (linkElement !== null) {
            link = parseURL(linkElement.getAttribute('href'), this.url);
        }

        const updatedElement = feedQueryXPath(this.document, entryElement,
            './atom:updated');
        if (updatedElement !== null) {
            date = parseDate(updatedElement.textContent);
        }

        // prefer the full content, fall back to the summary
        let contentElement = feedQueryXPath(this.document, entryElement,
            './atom:content');
        if (contentElement === null) {
            contentElement = feedQueryXPath(this.document, entryElement,
                './atom:summary');
        }
        if (contentElement !== null) {
            let contentType = contentElement.getAttribute('type');
            if (contentType === null) {
                contentType = 'text';
            }
            contentType = contentType.toLowerCase();
            if (contentType === 'xhtml') {
                // markup is embedded as child elements
                content = contentElement.innerHTML;
            } else if (contentType === 'html') {
                // markup is embedded as escaped text
                content = contentElement.textContent;
            } else {
                // anything else is shown as preformatted plain text
                const encodedContent =
                    encodeXML(contentElement.textContent.trim());
                content = `<pre>${encodedContent}</pre>`;
            }
        }

        return new FeedEntry({title, link, date, content});
    }

    /**
     * Build a Feed from the stored Atom document.
     *
     * @returns {Feed}
     */
    parseAtomFeed() {
        let title;
        let subtitle;
        let logo;
        const entries = [];
        const documentElement = this.document.documentElement;

        const titleElement = feedQueryXPath(this.document, documentElement,
            './atom:title');
        if (titleElement !== null) {
            title = titleElement.textContent.trim();
        }

        const subtitleElement = feedQueryXPath(this.document, documentElement,
            './atom:subtitle');
        if (subtitleElement !== null) {
            subtitle = subtitleElement.textContent.trim();
        }

        const logoElement = feedQueryXPath(this.document, documentElement,
            './atom:logo');
        if (logoElement !== null) {
            try {
                logo = this.parseAtomLogo(logoElement);
            } catch (e) {
                // an unusable logo is skipped, anything else is a real error
                if (!(e instanceof TypeError)) {
                    throw e;
                }
            }
        }

        const entryElements = feedQueryXPathAll(this.document, documentElement,
            './atom:entry');
        for (const entryElement of entryElements) {
            entries.push(this.parseAtomEntry(entryElement));
        }

        return new Feed(this.url, {title, subtitle, logo, entries});
    }

    /**
     * Build the feed logo from an RSS 0.9 <image> element.
     *
     * @param {Element} imageElement - the <image> element
     * @returns {FeedLogo}
     * @throws {TypeError} if the <url> child is missing or not a usable URL
     */
    parseRSS1Logo(imageElement) {
        let title;
        const urlElement = feedQueryXPath(this.document, imageElement,
            './rss:url');
        if (urlElement === null) {
            throw new TypeError('missing <url> element in <logo> element');
        }
        const url = parseURL(urlElement.textContent.trim(), this.url);
        if (url === null) {
            throw new TypeError('invalid URL in <logo> element');
        }

        const titleElement = feedQueryXPath(this.document, imageElement,
            './rss:title');
        if (titleElement !== null) {
            title = titleElement.textContent.trim();
        }

        return new FeedLogo(url, {title});
    }

    /**
     * Build a feed entry from an RSS 0.9 <item> element.
     *
     * @param {Element} itemElement - the <item> element
     * @returns {FeedEntry}
     */
    parseRSS1Entry(itemElement) {
        let title;
        let link;

        const titleElement = feedQueryXPath(this.document, itemElement,
            './rss:title');
        if (titleElement !== null) {
            title = titleElement.textContent;
        }

        const linkElement = feedQueryXPath(this.document, itemElement,
            './rss:link');
        if (linkElement !== null) {
            link = parseURL(linkElement.textContent, this.url);
        }

        return new FeedEntry({title, link});
    }

    /**
     * Build a Feed from the stored RSS 0.9 document.
     *
     * @returns {Feed}
     */
    parseRSS1Feed() {
        let title;
        let subtitle;
        let logo;
        const entries = [];
        const documentElement = this.document.documentElement;

        const titleElement = feedQueryXPath(this.document, documentElement,
            './rss:channel/rss:title');
        if (titleElement !== null) {
            title = titleElement.textContent;
        }

        // RSS 0.9 elements live in the RSS 0.9 namespace, so the description
        // needs the rss: prefix just like the title above
        const descriptionElement = feedQueryXPath(this.document,
            documentElement, './rss:channel/rss:description');
        if (descriptionElement !== null) {
            subtitle = descriptionElement.textContent;
        }

        const imageElement = feedQueryXPath(this.document, documentElement,
            './rss:image');
        if (imageElement !== null) {
            try {
                logo = this.parseRSS1Logo(imageElement);
            } catch (e) {
                // an unusable logo is skipped, anything else is a real error
                if (!(e instanceof TypeError)) {
                    throw e;
                }
            }
        }

        const itemElements = feedQueryXPathAll(this.document, documentElement,
            './rss:item');
        for (const itemElement of itemElements) {
            entries.push(this.parseRSS1Entry(itemElement));
        }

        return new Feed(this.url, {title, subtitle, logo, entries});
    }

    /**
     * Build the feed logo from an RSS 0.9x/2.0 <image> element.
     *
     * @param {Element} imageElement - the <image> element
     * @returns {FeedLogo}
     * @throws {TypeError} if the <url> child is missing or not a usable URL
     */
    parseRSS2Logo(imageElement) {
        let title;
        const urlElement = feedQueryXPath(this.document, imageElement, './url');
        if (urlElement === null) {
            throw new TypeError('missing <url> element in <logo> element');
        }
        const url = parseURL(urlElement.textContent.trim(), this.url);
        if (url === null) {
            throw new TypeError('invalid URL in <logo> element');
        }

        const titleElement = feedQueryXPath(this.document, imageElement,
            './title');
        if (titleElement !== null) {
            title = titleElement.textContent.trim();
        }

        return new FeedLogo(url, {title});
    }

    /**
     * Build an attached file from an RSS <enclosure> element.
     *
     * @param {Element} enclosureElement - the <enclosure> element
     * @returns {FeedEntryFile}
     * @throws {TypeError} if the url attribute is missing or not a usable
     *     URL
     */
    parseRSS2EntryFile(enclosureElement) {
        let type;
        let size;

        // a missing attribute is reported as null; it must not be handed to
        // the URL parser where it would be resolved as the path "null"
        const urlAttribute = enclosureElement.getAttribute('url');
        if (urlAttribute === null) {
            throw new TypeError('invalid URL in <enclosure> element');
        }
        const url = parseURL(urlAttribute, this.url);
        if (url === null) {
            throw new TypeError('invalid URL in <enclosure> element');
        }

        const typeAttribute = enclosureElement.getAttribute('type');
        if (typeAttribute !== null) {
            type = typeAttribute;
        }

        const length = parseInt(enclosureElement.getAttribute('length'), 10);
        if (!isNaN(length)) {
            size = length;
        }

        return new FeedEntryFile(url, {type, size});
    }

    /**
     * Build a feed entry from an RSS 0.9x/2.0 <item> element.
     * Enclosures without a usable URL are skipped.
     *
     * @param {Element} itemElement - the <item> element
     * @returns {FeedEntry}
     */
    parseRSS2Entry(itemElement) {
        let title;
        let link;
        let date;
        let content;
        const files = [];

        const titleElement = feedQueryXPath(this.document, itemElement,
            './title');
        if (titleElement !== null) {
            title = titleElement.textContent;
        }

        const linkElement = feedQueryXPath(this.document, itemElement,
            './link');
        if (linkElement !== null) {
            link = parseURL(linkElement.textContent, this.url);
        }

        const pubDateElement = feedQueryXPath(this.document, itemElement,
            './pubDate');
        if (pubDateElement !== null) {
            date = parseDate(pubDateElement.textContent);
        }

        const descriptionElement = feedQueryXPath(this.document, itemElement,
            './description');
        if (descriptionElement !== null) {
            content = descriptionElement.textContent.trim();
        }

        const enclosureElements = feedQueryXPathAll(this.document, itemElement,
            './enclosure');
        for (const enclosureElement of enclosureElements) {
            try {
                files.push(this.parseRSS2EntryFile(enclosureElement));
            } catch (e) {
                // an unusable enclosure is skipped, anything else is a real
                // error
                if (!(e instanceof TypeError)) {
                    throw e;
                }
            }
        }

        return new FeedEntry({title, link, date, content, files});
    }

    /**
     * Build a Feed from the stored RSS 0.9x/2.0 document.
     *
     * @returns {Feed}
     */
    parseRSS2Feed() {
        let title;
        let subtitle;
        let logo;
        const entries = [];
        const documentElement = this.document.documentElement;

        const titleElement = feedQueryXPath(this.document, documentElement,
            './channel/title');
        if (titleElement !== null) {
            title = titleElement.textContent;
        }

        const descriptionElement = feedQueryXPath(this.document,
            documentElement, './channel/description');
        if (descriptionElement !== null) {
            subtitle = descriptionElement.textContent;
        }

        const imageElement = feedQueryXPath(this.document, documentElement,
            './channel/image');
        if (imageElement !== null) {
            try {
                logo = this.parseRSS2Logo(imageElement);
            } catch (e) {
                // an unusable logo is skipped, anything else is a real error
                if (!(e instanceof TypeError)) {
                    throw e;
                }
            }
        }

        const itemElements = feedQueryXPathAll(this.document, documentElement,
            './channel/item');
        for (const itemElement of itemElements) {
            entries.push(this.parseRSS2Entry(itemElement));
        }

        return new Feed(this.url, {title, subtitle, logo, entries});
    }

    /**
     * Parse the text of a feed document.
     *
     * @param {string} xmlString - source text of the feed
     * @param {string|URL} url - location of the feed, used as base for
     *     relative links
     * @returns {Feed}
     * @throws {ParserError} if the text is not well-formed XML
     * @throws {UnsupportedFeedTypeError} if the document is not a supported
     *     feed format
     */
    parseFromString(xmlString, url) {
        this.url = url;
        this.document = new DOMParser().parseFromString(xmlString,
            'application/xml');
        // DOMParser reports XML errors through a <parsererror> document
        // instead of throwing
        if (this.document.documentElement.nodeName.toLowerCase() ===
                'parsererror') {
            throw new ParserError(this.document.documentElement.textContent);
        }

        const [type, version] = this.constructor.probeFeed(this.document);
        if (type === 'atom') {
            return this.parseAtomFeed();
        }
        if (type === 'rss') {
            return version === '0.9' ?
                this.parseRSS1Feed() :
                this.parseRSS2Feed();
        }

        throw new UnsupportedFeedTypeError();
    }
}