Node.js get into subpages - Content scraper

Hello everyone, can you please help me? I’m trying to get into subpages with https.request in Node.js. To get data from the page I’m using cheerio. I’m not using any other modules …

const fs = require('fs');
const https = require('https');
const cheerio = require('cheerio');

// Scraper attempt: requests options.path on options.host, collects the href of
// every '.products li a' element, then tries to request each collected link.
let pageLink;
let siteProducts = "";

const options = {
    host: 'shirts4mike.com',
    path: '/shirts.php',
}

let pageLinks = []; // href attributes collected from the first https.request

//connect with the website
const request = https.request(options, function(response){
    //console.log(options);
    
    response.on('data', function (chunk) {
        // Every 'data' chunk is parsed by itself, so this handler (including the
        // loop below) runs once per chunk rather than once per page.
        const $ = cheerio.load(chunk);

        $('.products li a').each( function(linkIndex) {

            pageLink = $(this).attr('href');
            pageLinks.push(pageLink);

        });  

        // Loop over the links saved in pageLinks. pageLinks is module-level and
        // the loop starts at 0, so links from earlier chunks are visited again.
        for(var i = 0; i < pageLinks.length; i+= 1){ 

          let pages =  pageLinks[i];

          /** This is the part that does not work: */
          // The first argument is the plain string options.host + pages: it has
          // no 'https://' prefix and no separator is added between host and href.
          // NOTE(review): https.request parses a string argument as a URL, so it
          // presumably needs the protocol here - confirm against the Node docs.
          // requestPages.end() is never called in this block.
          const requestPages = https.request((options.host + pages), function(response){
            response.on('data', function (chunk) {
                siteProducts += chunk; console.log(siteProducts);
            });
          })
          /** end of the failing part */

        }
    });
    
    response.on('end', function (){
        console.log('no more data in response');
    });

});
// NOTE(review): request.end() is not called anywhere in this block.

I just want to do another https request with the links I have saved in the variable pageLinks, looping over them to get the data I want, but it does not work. I don’t know if it is possible to do an https.request inside another https.request … :disappointed_relieved:

Does your first request even work?

I think you may be missing a couple of cases where you should be calling request.end(); and requestPages.end();

After that, (options.host + pages) won’t be a valid URL; it’ll be something like shirts4mike.comshirts.php?shirt=101 (missing the https:// at the start and the '/' between the host and the path)

Hi gebulmer! No, I’m not missing anything, and my first request works perfectly up until this block of code …

/** This is the part that does not work: */
          // Quoted from the script above: the first argument is the string
          // options.host + pages (no protocol prefix), and requestPages.end()
          // is not called.
          const requestPages = https.request((options.host + pages), function(response){
            response.on('data', function (chunk) {
                siteProducts += chunk; console.log(siteProducts);
            });
          })
/** end of the failing part */ 

You can console.log let pages = pageLinks[i]; and see what I already got from https://www.shirts4mike.com/shirt.php
But now the question is: how can I do another https.request inside another https.request? How can I get this block of code to work? I added a slash (/):

 // Second attempt: a '/' is now prepended to every saved link before requesting it.
 for(var i = 0; i < pageLinks.length; i+= 1){
          let pages =  '/' + pageLinks[i];
          /** This is the part that does not work: */
            // console.log(pages);
            // The argument is still a plain string without 'https://', and
            // requestPages.end() is still not called.
            const requestPages = https.request(options.host + pages, function(response){
                response.on('data', function (chunk) {
                    siteProducts += chunk; console.log(siteProducts);
                });
            })
          /** end of the failing part */
        }

Here’s your code with the changes that I mentioned, does this work for you?

const https = require('https');
const cheerio = require('cheerio');

// Accumulates the HTML of every product sub-page that has been fully received.
let siteProducts = '';

const options = {
    host: 'shirts4mike.com',
    path: '/shirts.php',
};

const pageLinks = []; // href attributes collected from the listing page

/**
 * Requests one product sub-page and appends its complete HTML to siteProducts.
 *
 * @param {string} href - The href read from a product link; may be relative.
 */
function fetchProductPage(href) {
    let pageUrl;
    try {
        // Resolve against the listing URL so relative and absolute hrefs both
        // become a full URL. When https.request() is given a URL instead of an
        // options object, the protocol has to be part of that URL.
        pageUrl = new URL(href, `https://${options.host}${options.path}`);
    } catch (error) {
        console.error(`Skipping unparsable link "${href}": ${error.message}`);
        return;
    }

    // https.request() throws on any other protocol (e.g. mailto: links).
    if (pageUrl.protocol !== 'https:') {
        console.error(`Skipping non-https link ${pageUrl.href}`);
        return;
    }

    const request2 = https.request(pageUrl, function (response) {
        let body = '';
        // Decode as text so a multi-byte character split across two chunks
        // is not corrupted.
        response.setEncoding('utf8');
        response.on('data', function (chunk) {
            body += chunk;
        });
        response.on('end', function () {
            // Append whole pages only, so concurrent responses do not get
            // interleaved inside siteProducts.
            siteProducts += body;
            console.log(body);
        });
    });

    request2.on('error', function (error) {
        console.error(`Request for ${pageUrl.href} failed: ${error.message}`);
    });

    // Without end() the request is never finished.
    request2.end();
}

//connect with the website
const request = https.request(options, function (response) {
    if (response.statusCode !== 200) {
        console.error(`Listing request failed with status ${response.statusCode}`);
        response.resume(); // discard the body so the socket is released
        return;
    }

    let html = '';
    response.setEncoding('utf8');

    // Only buffer here: a single chunk can end in the middle of a tag, so the
    // document is parsed once, after the whole response has arrived.
    response.on('data', function (chunk) {
        html += chunk;
    });

    response.on('end', function () {
        console.log('no more data in response');

        const $ = cheerio.load(html);

        // A regular function is required: cheerio binds `this` to the element.
        $('.products li a').each(function () {
            const href = $(this).attr('href');
            if (href) {
                pageLinks.push(href);
            }
        });

        console.log(pageLinks);

        // One request per product link found on the listing page.
        for (const href of pageLinks) {
            fetchProductPage(href);
        }
    });
});

request.on('error', function (error) {
    console.error(`Listing request failed: ${error.message}`);
});

request.end();
1 Like

I should probably clarify why I think your code wasn’t working

Running the code I posted with the changes works, so assuming I’m right:

  • requestPages didn’t have requestPages.end() called, so the requests didn’t work like you think they should,
  • request got away with not having end called by existing at the end of the file

The url part was just the problem after that

Thank you very much! I understand.
Do you know why the first request works without 'https://'?
I just passed the options variable like this: const request = https.request(options, function(response) and it worked. But for the second request I needed this: https.request(('https://' + options.host + '/' + pages), function(response)
I am confused now…

Yes it does seem a bit odd to need the https there

The reason is basically in how the node https library handles the argument

If the argument is a string, it tries to parse the string as a URL to create an options object, and that’s where it needs the protocol: it assumes that you’re setting the protocol yourself in the string to be parsed

Otherwise if the first argument is the options object directly it adds the default protocol to the object

It’s a bit of a common pitfall, I think