Hello all,
I am working on a nodejs application whos e purpose is to search DOCX, PDF, and XLSX documents and return a csv with the results.
It is invoked via the command line, and accepts two arguments: The folder to search documents, and a text file containing search terms separated by newline.
It is frustratingly slow and has a memory leak somewhere. For example, when I read a library of ~3GB of PDFs the RAM usage climbs to almost 2GB!! I’m fearful that on very large folders, the js heap will run out of memory.
Here is everything declared in the global scope:
/**
* filesystem: provides methods used to validate the directory input, search terms, and retrieve file names.
* docx: searchDocx method returns an array of match objects, or a blank array if no terms are found.
* pdf: searchPdf method returns an array of match objects, or a blank array if no terms are found.
* xlsx: searchXlsx method returns an array of match objects, or a blank array if no terms are found.
* csv: outputState class controls csv output
*/
const fsCheck = require('./fileSystem')
const pdf = require('./pdf')
const docx = require('./docx')
const xlsx = require('./xlsx')
const csv = require('./csv')
// Directory arg
const searchDir = process.argv[2]
// Search terms arg
const searchTermFile = process.argv[3]
const usageMessage = 'Syntax: ./app ./directory/to/source/documents ./searchTermFile.txt'
// Match object
function match(searchTerm, file) {
this.searchTerm = searchTerm
this.file = file
}
I store each match found into the match object, and then push that to an array.
Here are the two functions I use to “drive” the application logic…
generateSearches
async function generateSearches(dirInput, searchInput) {
try {
var filesNoDir = []
var files = []
// var matches = []
var outputState = new csv.outputState
outputState.init()
const searchTerms = fsCheck.validSearchTermInput(searchInput)
if (fsCheck.validDirInput(dirInput) && searchTerms !== undefined) {
files = fsCheck.constructPaths(dirInput)
filesNoDir = fsCheck.getFiles(dirInput)
} else {
console.log(usageMessage)
return process.exit(1)
}
for (let [index, value] of files.entries()) {
var percentage = ( index / files.length ) * 100
console.log(`Files read: ${index} Files to read: ${files.length - index} Files total: ${files.length}`)
console.log(`Percentage complete: ${percentage}%`)
var result = await routeFile(value, searchTerms, filesNoDir[index], dirInput)
if (result !== undefined) {
var matches = []
matches.push(result)
var flattened = [].concat.apply([], matches)
outputState.appendMatches(flattened)
matches = []
}
}
console.log('Done!')
} catch(err) {
console.log(err)
}
}
routeFile
async function routeFile(file, searchTerms, fileName, directory) {
try {
let extension = file.slice(-4)
switch(extension) {
case '.pdf':
try {
console.log('Reading file ' + file)
const pdfResults = await pdf.searchPdf(file, searchTerms, fileName)
return pdfResults
} catch (err) {
console.log('Error parsing pdf')
}
break
case 'docx':
try {
console.log('Reading file ' + file)
const docxResults = await docx.searchDocx(file, searchTerms, fileName)
return docxResults
} catch(err) {
console.log('Error parsing docx')
}
break
case 'xlsx':
try {
console.log('Reading file ' + file)
const xlsxResults = await xlsx.searchXlsx(file, searchTerms, fileName)
return xlsxResults
} catch(err) {
console.log('Error parsing xlsx')
}
}
} catch(err) {
console.log(file + ' is an unsupported filetype.')
}
}
And finally, here is my ‘./pdf’ require.
const { Index, Document, Worker } = require('flexsearch');
const index = new Index("performance");
const pdfjsLib = require("pdfjs-dist/legacy/build/pdf");
function match(searchTerm, file) {
this.searchTerm = searchTerm
this.file = file
}
/**
*
* @param {string} pathToFile The path to the folder to be searched
* @param {string} searchTerms The path to the search terms
* @returns {Object} match Returns an array of match objects
*/
async function searchPdf(pathToFile, searchTerms, fileName) {
try {
let extract = await GetTextFromPDF(pathToFile)
return searchText(extract, searchTerms, fileName)
} catch (err) {
console.log(err)
console.log('Could not parse PDF')
}
}
async function GetTextFromPDF(path) {
let strings = []
let doc = await pdfjsLib.getDocument(path).promise;
let totalPages = doc.numPages
for (let i = 1; i <= totalPages; i++) {
let currentPage = await doc.getPage(i)
let content = await currentPage.getTextContent()
let extract = content.items.map(function(item) {
return item.str
})
strings.push(extract)
}
return strings
}
function searchText(text, terms, fileName) {
for (let i = 0; i < text.length; i++) {
index.add(i, text[i])
}
const resultArr = []
for (term of terms) { // Break this loop if all terms are found
if (index.search(term).length !== 0) {
var matchObj = new match(term, fileName)
resultArr.push(matchObj)
}
}
return resultArr
}
Bottom line: I am having a lot of trouble optimizing performance for this program. I’ve tried running node with --proc flag and also leveraging 0x to generate a flame graph. But as far as I can tell, the pdfjs-dist library is a primary performance culprit. Does anyone have pointers or suggestions on how I can solve the RAM usage issue? Cheers!