Memory leak in nodejs application

liam.stojanovic · October 13, 2021, 1:00pm

Hello all,

I am working on a nodejs application whos e purpose is to search DOCX, PDF, and XLSX documents and return a csv with the results.

It is invoked via the command line, and accepts two arguments: The folder to search documents, and a text file containing search terms separated by newline.

It is frustratingly slow and has a memory leak somewhere. For example, when I read a library of ~3GB of PDFs the RAM usage climbs to almost 2GB!! I’m fearful that on very large folders, the js heap will run out of memory.

Here is everything declared in the global scope:

/**
 * filesystem: provides methods used to validate the directory input, search terms, and retrieve file names.
 * docx: searchDocx method returns an array of match objects, or a blank array if no terms are found.
 * pdf: searchPdf method returns an array of match objects, or a blank array if no terms are found.
 * xlsx: searchXlsx method returns an array of match objects, or a blank array if no terms are found.
 * csv: outputState class controls csv output
 */
const fsCheck = require('./fileSystem')
const pdf = require('./pdf')
const docx = require('./docx')
const xlsx = require('./xlsx')
const csv = require('./csv')
// Directory arg
const searchDir = process.argv[2]
// Search terms arg
const searchTermFile = process.argv[3]
const usageMessage = 'Syntax: ./app ./directory/to/source/documents ./searchTermFile.txt'

// Match object 
function match(searchTerm, file) {
    this.searchTerm = searchTerm
    this.file = file
}

I store each match found into the match object, and then push that to an array.

Here are the two functions I use to “drive” the application logic…
generateSearches

async function generateSearches(dirInput, searchInput) {
    try {
        var filesNoDir = []
        var files = []
        // var matches = []
        var outputState = new csv.outputState
        outputState.init()
        const searchTerms = fsCheck.validSearchTermInput(searchInput)
        if (fsCheck.validDirInput(dirInput) && searchTerms !== undefined) {
            files = fsCheck.constructPaths(dirInput)
            filesNoDir = fsCheck.getFiles(dirInput)
        } else {
            console.log(usageMessage)
            return process.exit(1)
        }
        for (let [index, value] of files.entries()) {
            var percentage = ( index / files.length ) * 100
            console.log(`Files read: ${index} Files to read: ${files.length - index} Files total: ${files.length}`)
            console.log(`Percentage complete: ${percentage}%`)
            var result = await routeFile(value, searchTerms, filesNoDir[index], dirInput)
            if (result !== undefined) {
                var matches = []
                matches.push(result)
                var flattened = [].concat.apply([], matches)
                outputState.appendMatches(flattened)
                matches = []
            }
        }

        console.log('Done!')
    } catch(err) {
        console.log(err)
    }
}

routeFile

async function routeFile(file, searchTerms, fileName, directory) {
    try {
        let extension = file.slice(-4)
        switch(extension) {
            case '.pdf':
                try {
                    console.log('Reading file ' + file)
                    const pdfResults = await pdf.searchPdf(file, searchTerms, fileName)
                    return pdfResults
                } catch (err) {
                    console.log('Error parsing pdf')
                }
                break
            case 'docx':
                try {
                    console.log('Reading file ' + file)
                    const docxResults = await docx.searchDocx(file, searchTerms, fileName)
                    return docxResults
                } catch(err) {
                    console.log('Error parsing docx')
                }
                break
            case 'xlsx':
                try {
                    console.log('Reading file ' + file)
                    const xlsxResults = await xlsx.searchXlsx(file, searchTerms, fileName)
                    return xlsxResults
                } catch(err) {
                    console.log('Error parsing xlsx')
                }
        }

    } catch(err) {
        console.log(file + ' is an unsupported filetype.')
    }
}

And finally, here is my ‘./pdf’ require.

const { Index, Document, Worker } = require('flexsearch');
const index = new Index("performance");
const pdfjsLib = require("pdfjs-dist/legacy/build/pdf");

function match(searchTerm, file) {
    this.searchTerm = searchTerm
    this.file = file
}

/**
 * 
 * @param {string} pathToFile The path to the folder to be searched
 * @param {string} searchTerms The path to the search terms
 * @returns {Object} match Returns an array of match objects
 */
async function searchPdf(pathToFile, searchTerms, fileName) {
    try {
        let extract = await GetTextFromPDF(pathToFile)
        return searchText(extract, searchTerms, fileName)

    } catch (err) {
        console.log(err)
        console.log('Could not parse PDF')
    }
}

async function GetTextFromPDF(path) {
    let strings = []
    let doc = await pdfjsLib.getDocument(path).promise;
    let totalPages = doc.numPages
    for (let i = 1; i <= totalPages; i++) {
        let currentPage = await doc.getPage(i)
        let content = await currentPage.getTextContent()
        let extract = content.items.map(function(item) {
            return item.str
        })
        strings.push(extract)
    }
    return strings
}

function searchText(text, terms, fileName) {
    for (let i = 0; i < text.length; i++) {
        index.add(i, text[i])
    }
    const resultArr = []
    for (term of terms) { // Break this loop if all terms are found
        if (index.search(term).length !== 0) {
            var matchObj = new match(term, fileName)
            resultArr.push(matchObj)
        }
    }
    return resultArr
}

Bottom line: I am having a lot of trouble optimizing performance for this program. I’ve tried running node with --proc flag and also leveraging 0x to generate a flame graph. But as far as I can tell, the pdfjs-dist library is a primary performance culprit. Does anyone have pointers or suggestions on how I can solve the RAM usage issue? Cheers!

lasjorg · October 13, 2021, 3:57pm

I’m sure there is a reason why you are using the legacy version but have you at least tested the library using the non-legacy one?

liam.stojanovic · October 13, 2021, 4:09pm

That’s a good point, I’ll test it with non-legacy and see what happens.

I used the legacy version because I wasn’t able to compile it (using pkg) with the non-legacy version.

DanCouper · October 14, 2021, 7:22am

I think you’re on a hiding to nothing here regardless of what you do. pdf.js is designed for viewing one PDF at a time, in a browser. In normal usage, it renders the PDF into a canvas element, streaming that single file in over HTTP, starting the render as soon as it starts to comes in. You don’t need any of this but you’re still going to get the full core of what powers that functionality – the library is specifically designed for that (it used to power Firefox’s in-browser PDF reader, though I’m not sure if that’s still the case). If you’re just extracting text, I’d strongly suggest using a library that only parses text from the raw PDF files unless you really really need to render the entire files.

system · April 14, 2022, 7:23pm

This topic was automatically closed 182 days after the last reply. New replies are no longer allowed.