Hello people! I have this little snippet of code (which I’m gonna leave below), which however is not working. I am building a scraper, and for readability I am splitting it into functions. However, the second step (in the last two lines) keeps returning an empty array, and I don’t understand why since the await should force the axios request to pull through before returning its value. What it should return instead is an array of links.
Can someone please take a look and help me figure out what is wrong?
const path = require("path");
const fs = require("fs");
const axios = require("axios");
const cheerio = require("cheerio");
const createDirectory = (name) => {
const dirName = path.join(__dirname, name);
if (fs.existsSync(dirName)) return;
fs.mkdirSync(dirName);
};
const getCategories = async (cityUrl) => {
const baseUrl = "https://graffiti-database.com";
const cityCategories = [];
await axios
.get(cityUrl + "/categories")
.then((res) => {
const $ = cheerio.load(res.data);
$("p.image-info > a").each(function () {
let i = $(this).attr("href");
cityCategories.push(baseUrl + i);
});
})
.catch((err) => console.error(">> Error retrieving categories: ", err));
return cityCategories;
};
const getPagesForCategories = (categories) => {
const allPages = [];
categories.forEach(async (category) => {
await axios.get(category).then((res) => {
const $ = cheerio.load(res.data);
const lastPage = $("li.page-item")
.text()
.split("\t\t\t\t\t\t\t")
.slice(-3, -2)[0]
.replace(/[\t\n]/g, "");
// allPages.push(`${category}?page=${lastPage}`);
// console.log(allPages);
for (page = 1; page <= lastPage; page++) {
allPages.push(`${category}?page=${page}`);
}
//console.log(allPages);
});
});
return allPages;
};
getCategories("https://graffiti-database.com/Italy/Milan")
.then((categories) => getPagesForCategories(categories))
.then((pages) => console.log(pages));