Web Scraper Outputs Just One Item Instead of Many

I tried to learn how to do this on my own by taking two video courses about Node.js and Cheerio on udemy.com, but I haven’t made much progress. The following code produces an array of image URLs:

const request = require("request-promise");
const cheerio = require("cheerio");

// Address of the page that scrapeJobHeader() downloads.
const url = "https://www.example.com";

// Module-level accumulator: scrapeJobHeader() pushes one { img_url } object per matched link into it.
const scrapeResults = [];
/**
 * Downloads the page at `url` and records the `src` of the <img> that is a
 * direct child of every `td.productListing-data > a` link.
 *
 * Every matched link contributes exactly one `{ img_url }` entry to the
 * shared `scrapeResults` array; `img_url` is undefined when `.attr("src")`
 * finds nothing to read on that link.
 *
 * @returns {Promise<Array<{img_url: (string|undefined)}>|undefined>} The
 *   shared `scrapeResults` array, or undefined when anything inside the
 *   try block threw (the error is logged with console.error).
 */
async function scrapeJobHeader() {
  try {
    const pageHtml = await request.get(url);
    const $ = await cheerio.load(pageHtml);
    const productLinks = $("td.productListing-data > a ").toArray();

    for (const link of productLinks) {
      const img_url = $(link).children("img").attr("src");
      scrapeResults.push({ img_url });
    }

    return scrapeResults;
  } catch (err) {
    console.error(err);
  }
}

/**
 * Runs scrapeJobHeader() and prints whatever it resolves to.
 *
 * @returns {Promise<void>}
 */
async function scrapeWebsite() {
  console.log(await scrapeJobHeader());
}

// Entry point: start the scrape. The returned promise is not awaited here.
scrapeWebsite();

but I get the error “TypeError: Cannot read property ‘replace’ of undefined” when I do this:

// Throws on links where .attr("src") returns undefined (see the undefined
// entries in the results below): undefined has no .replace() method.
// NOTE(review): inside a string literal, "\\/" is a literal backslash followed
// by "/", so this pattern would not match 'images/more_color.png' anyway.
const img_url = resultTitle.attr("src").replace("images\\/more_color.png","");

The images that are returned include “images/more_color.png” but I just want to return the actual product images and not the pngs.

The results look like this:

{ img_url: 'images/more_color.png' },
  { img_url: undefined },
  { img_url: undefined },
  {
    img_url: 'images/20191206/thumb/AK1501-@RH-CRY-LOVE@22X06-825_3L@467400@200@01@200.jpg'
  },
  { img_url: 'images/more_color.png' },
  { img_url: undefined },
  { img_url: undefined },
  {
    img_url: 'images/20191206/thumb/AK1501-@GD-CRY-LOVE@22X06-825_3L@467399@200@01@200.jpg'
  },
  { img_url: 'images/more_color.png' },
  { img_url: undefined },
  { img_url: undefined },
  {
    img_url: 'images/20191206/thumb/AK1500-@GD-CRY-QUEEN@3X06-825_3L@467397@200@01@200.jpg'
  },
  { img_url: 'images/more_color.png' },
  { img_url: undefined },
  { img_url: undefined },

The next step would be to get rid of the undefined image URLs but I haven’t gotten that far yet.

I got rid of the undefined ones by using:

  // Loose `!= null` rejects both undefined and null, so entries without a src are skipped.
  if (img_url != null) {
    scrapeResults.push(scrapeResult);
  }

You can check that the image's `src` string is truthy and that it does not end with `.png` before pushing it to the array.

/**
 * Downloads the page at `url` and collects the product image URLs found in
 * `td.productListing-data > a` links.
 *
 * A link is skipped when `.attr('src')` on its direct <img> child is falsy
 * (undefined or empty), or when that `src` ends with ".png" (for example
 * the images/more_color.png icon, which is not a product photo).
 *
 * @returns {Promise<Array<{img_url: string}>|undefined>} The shared
 *   `scrapeResults` array, or undefined when anything inside the try block
 *   threw (the error is logged with console.error).
 */
async function scrapeJobHeader() {
  try {
    const htmlResult = await request.get(url);
    // cheerio.load() is synchronous, so there is nothing to await here.
    const $ = cheerio.load(htmlResult);
    $('td.productListing-data > a ').each((index, element) => {
      // Read the attribute once and reuse it for both checks and the result.
      const img_url = $(element).children('img').attr('src');
      if (img_url && !img_url.endsWith('.png')) {
        scrapeResults.push({ img_url });
      }
    });
    return scrapeResults;
  } catch (err) {
    console.error(err);
  }
}

Thank you lasjorg! That worked!

No problem, glad to help. I should have linked to the endsWith method docs.

I suggest you keep these three links handy; on the left-hand side of each page is a list of all the methods available on the respective object (Object, Array, String).



Happy coding!