blob: c8d5d8675f89b2f50ff794b5d827193717c68a6a [file] [log] [blame]
// Copyright 2015 The Vanadium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Returns a stream that consumes filenames corresponding to HTML pages, and
// emits anchor urls extracted from these HTML pages. Emits any given url at
// most once. Strips fragments from urls, and does not emit fragment-only
// URLs.
var cheerio = require('cheerio');
var fs = require('graceful-fs');
var path = require('path');
var through = require('through2');
module.exports = create;
function create() {
return through.obj(write);
// Called every-time a file is emitted from the powerwalk stream.
function write(buffer, enc, callback) {
var stream = this;
var file = buffer.toString();
// Do not scrape non-html files.
if (path.extname(file) !== '.html') {
callback();
return;
}
fs.readFile(file, function ondata(err, data) {
if (err) {
callback(err);
return;
}
// Scrape links.
var $ = cheerio.load(data);
$('a').each(function(i, element) {
var href = $(element).attr('href') || '';
// Remove trailing fragements.
href = href.replace(/html#(.*)/, 'html');
var isFragement = !!href.match(/^#/);
// Filter out any empty hrefs, or if the href is only an anchor tag (a
// link to a heading in the current document).
if (!href || isFragement) {
return;
}
// Ignore external links.
// NOTE: This could be made as an option if there is a need to test
// outbound links.
var isExternal = !!href.match(/^http/);
if (isExternal && !stream._external) {
return;
}
stream.push({
destination: href,
source: file
});
});
callback();
});
}
}