Python Asyncio/AioHttp Store Scraped Data in Specific Format

I have 4 lists mainly fabrics len --> 24, design_Name len --> 84, creator_Name len --> 84 and results len --> 2106.I want to map those values with each other in a specific manner. What I am looking for is something like this.

Expected Output:
{('Catching Fireflies', 'thestorysmith'): {'fabric_name_00': 'FABRIC_PETAL_SIGNATURE_COTTON', 'test_swatch_meter_00': 1.75, 'fat_quarter_meter_00': 10.58, 'meter_00': 18.22, 'fabric_name_01': 'FABRIC_SATIN', 'test_swatch_meter_01': 1.75, 'fat_quarter_meter_01': 11.85, 'meter_01': 19.71, 'fabric_name_02': 'FABRIC_COTTON_POPLIN_BRAVA', 'test_swatch_meter_02': 1.75, 'fat_quarter_meter_02': 11.85, 'meter_02': 19.71, 'fabric_name_03': 'FABRIC_PERFORMANCE_PIQUE', 'test_swatch_meter_03': 1.75, 'fat_quarter_meter_03': 12.9, 'meter_03': 22.65, 'fabric_name_04': 'FABRIC_CHIFFON', 'test_swatch_meter_04': 1.75, 'fat_quarter_meter_04': 12.9, 'meter_04': 22.65, 'fabric_name_05': 'FABRIC_ORGANIC_SWEET_PEA_GAUZE', 'test_swatch_meter_05': 1.75, 'fat_quarter_meter_05': 12.9, 'meter_05': 22.65, 'fabric_name_06': 'FABRIC_POLY_CREPE_DE_CHINE', 'test_swatch_meter_06': 1.75, 'fat_quarter_meter_06': 12.9, 'meter_06': 22.65, 'fabric_name_07': 'FABRIC_COTTON_LAWN_APPAREL', 'test_swatch_meter_07': 1.75, 'fat_quarter_meter_07': 11.75, 'meter_07': 21.66, 'fabric_name_08': 'FABRIC_LIGHTWEIGHT_COTTON_TWILL', 'test_swatch_meter_08': 1.75, 'fat_quarter_meter_08': 12.9, 'meter_08': 25.63, 'fabric_name_09': 'FABRIC_MODERN_JERSEY', 'test_swatch_meter_09': 1.75, 'fat_quarter_meter_09': 13.29, 'meter_09': 26.18, 'fabric_name_10': 'FABRIC_COTTON_SPANDEX_JERSEY', 'test_swatch_meter_10': 1.75, 'fat_quarter_meter_10': 13.57, 'meter_10': 26.42, 'fabric_name_11': 'FABRIC_ORGANIC_COTTON_SATEEN', 'test_swatch_meter_11': 1.75, 'fat_quarter_meter_11': 13.83, 'meter_11': 26.67, 'fabric_name_12': 'FABRIC_LINEN_COTTON_CANVAS', 'test_swatch_meter_12': 1.75, 'fat_quarter_meter_12': 14.79, 'meter_12': 27.59, 'fabric_name_13': 'FABRIC_ORGANIC_COTTON_KNIT_PRIMA', 'test_swatch_meter_13': 1.75, 'fat_quarter_meter_13': 13.83, 'meter_13': 26.67, 'fabric_name_14': 'FABRIC_FLEECE', 'test_swatch_meter_14': 1.75, 'fat_quarter_meter_14': 14.79, 'meter_14': 27.59, 'fabric_name_15': 'FABRIC_MINKY', 'test_swatch_meter_15': 1.75, 'fat_quarter_meter_15': 15.29, 'meter_15': 28.15, 'fabric_name_16': 'FABRIC_DOGWOOD_DENIM', 'test_swatch_meter_16': 1.75, 'fat_quarter_meter_16': 15.83, 'meter_16': 29.54, 'fabric_name_17': 'FABRIC_PERFORMANCE_LINEN', 'test_swatch_meter_17': 1.75, 'fat_quarter_meter_17': 16.8, 'meter_17': 31.58, 'fabric_name_18': 'FABRIC_RECYCLED_CANVAS', 'test_swatch_meter_18': 1.75, 'fat_quarter_meter_18': 16.8, 'meter_18': 31.58, 'fabric_name_19': 'FABRIC_SPORT_LYCRA', 'test_swatch_meter_19': 1.75, 'fat_quarter_meter_19': 16.8, 'meter_19': 31.58, 'fabric_name_20': 'FABRIC_CYPRESS_COTTON_BRAVA', 'test_swatch_meter_20': 1.75, 'fat_quarter_meter_20': 17.23, 'meter_20': 32.48, 'fabric_name_21': 'FABRIC_CELOSIA_VELVET', 'test_swatch_meter_21': 1.75, 'fat_quarter_meter_21': 18.7, 'meter_21': 35.46, 'fabric_name_22': 'FABRIC_PERFORMANCE_VELVET', 'test_swatch_meter_22': 1.75, 'fat_quarter_meter_22': 21.5, 'meter_22': 41.08, 'fabric_name_23': 'FABRIC_BELGIAN_LINEN', 'test_swatch_meter_23': 1.75, 'fat_quarter_meter_23': 29.54, 'meter_23': 58.17}}

I’ve tried so far:

import asyncio

import aiohttp

import json

import requests

from bs4 import BeautifulSoup

import pandas as pd

from CreateDict import CreateDict

headers = {

    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',

    'Accept': '*/*',

    'Accept-Language': 'en-US,en;q=0.5',

    'X-Spoonflower-Window-UUID': 'a9bc37a2-9eb2-4a1e-8ea1-fcee89347364',

    'Content-Type': 'application/json',

    'Origin': 'https://www.spoonflower.com',

    'Connection': 'keep-alive',

    'Referer': 'https://www.spoonflower.com/',

    'Sec-GPC': '1',

    'If-None-Match': 'W/95d6572c326b81ce98c7ae27ac449d42',

    'TE': 'Trailers',

}

def get_fabric_names():

    res = requests.get('https://www.spoonflower.com/spoonflower_fabrics')

    soup = BeautifulSoup(res.text, 'lxml')

    fabrics = [fabric.find('h2').text.strip() for fabric in soup.find_all('div', {'class': 'product_detail medium_text'})]

    fabric = [("_".join(fab.upper().replace(u"\u2122", '').split())) for fab in fabrics]

    for index in range(len(fabric)):

        if 'COTTON_LAWN_(BETA)' in fabric[index]:

            fabric[index] = 'COTTON_LAWN_APPAREL'

        elif 'COTTON_POPLIN' in fabric[index]:

            fabric[index] = 'COTTON_POPLIN_BRAVA'

        elif 'ORGANIC_COTTON_KNIT' in fabric[index]:

            fabric[index] = 'ORGANIC_COTTON_KNIT_PRIMA'

        elif 'PERFORMANCE_PIQUÉ' in fabric[index]:

            fabric[index] = 'PERFORMANCE_PIQUE'

        elif 'CYPRESS_COTTON' in fabric[index]:

            fabric[index] = 'CYPRESS_COTTON_BRAVA'

    return fabric

async def get_designEndpoint(session, url):

    """

    Get Design End Point

    :param url:

    """

    async with session.get(url) as response:

        response = await response.read()

        # print(response)

        json_response = json.loads(response.decode("utf-8"))

        extracting_endpoint = json_response['page_results']

        # extracting designId

        design_Id = [item['designId'] for item in extracting_endpoint]

        # extracting designName

        design_Name = [item['name'] for item in extracting_endpoint]

        # extracting creator_Name

        creator_Name = [item['user']['screenName'] for item in extracting_endpoint]

        return design_Id, design_Name, creator_Name

async def get_Fabric_Pricing_Data(session, url):

    """

    Extract all the pricing data with respect to Fabric type

    :param url: detail url

            :Return: json data

    """

    

    async with session.get(url) as response:

        response = await response.read()

        json_response = json.loads(response)

        #print(json_response)

        # Extracting Data

        try:

            fabric = json_response['data']['fabric_code']

        except:

            fabric = 'N/A'

        try:

            test_swatch_meter = json_response['data']['pricing']['TEST_SWATCH_METER']['price']

        except:

            test_swatch_meter = 'N/A'

        try:

            fat_quarter_meter = json_response['data']['pricing']['FAT_QUARTER_METER']['price']

        except:

            fat_quarter_meter = 'N/A'

        try:

            meter = json_response['data']['pricing']['METER']['price']

        except:

            meter = 'N/A'

        # summary = fabric + "|" + str(test_swatch_meter) + "|" + str(fat_quarter_meter) + "|" + str(meter)

        return fabric, test_swatch_meter, fat_quarter_meter, meter

async def main():

    urls = []

    tasks = []

    async with aiohttp.ClientSession(headers=headers) as session:

        fabrics = get_fabric_names()

        design_Id, design_Name, creator_Name = await get_designEndpoint(session, 'https://pythias.spoonflower.com/search/v1/designs?lang=en&page_offset=0&sort=bestSelling&product=Fabric&forSale=true&showMatureContent=false&page_locale=en')

        for item in design_Id:

            for fab_type in fabrics[0:-3]:

                price_url = 'https://api-gateway.spoonflower.com/alpenrose/pricing/fabrics/FABRIC_'+ fab_type +'?quantity=1&shipping_country=PK&currency=EUR&measurement_system=METRIC&design_id='+str(item)+'&page_locale=en'

                print(price_url)

                urls.append(price_url)

        for url in urls:

            tasks.append(asyncio.create_task(get_Fabric_Pricing_Data(session, url)))

        results = await asyncio.gather(*tasks)

        print(len(design_Name))

        print(len(creator_Name))

        print(len(results))

        return design_Name, creator_Name, results

if __name__ == '__main__':

    loop = asyncio.get_event_loop()

    fabrics = get_fabric_names()[0:-3]

    design_Name, creator_Name, results = loop.run_until_complete(main())

    #print(creator_Name)

    for fab in fabrics:
        print(fab)
    for name, creator in zip(design_Name, creator_Name):
        for fab_type in fabrics:
            Design_Name = name
            Creator_Name = creator
            test_swatch_meter = results[1]
            fat_quarter_meter = results[2]
            meter = results[3]

            if (name, creator) not in items_dict.keys():
                items_dict[(name, creator)] = {}
            itemCount = len(items_dict[(name, creator)].values()) / 4
            items_dict[(name, creator)].update({'fabric_name_%02d' %itemCount: fab_type,
            'test_swatch_meter_%02d' %itemCount: test_swatch_meter,
            'fat_quarter_meter_%02d' %itemCount: fat_quarter_meter,
            'meter_%02d' %itemCount: meter})
df = pd.DataFrame.from_dict(items_dict, orient='index').reset_index(drop=False)
df = df.rename(columns={'level_0':'designName','level_1':'screenName'})
df.to_csv('scraped_data.csv', index=False)

but cannot get it to format I am looking for like the given spreadsheet.
Any help would be much appreciated I am trying for almost a week but no luck.
Thanks in Advance.

This topic was automatically closed 182 days after the last reply. New replies are no longer allowed.