Hi All
I am really new to coding but enjoying learning something completely different from my usual job (paramedic). I am looking to make a web scraper for the UK property website Rightmove.
I have followed the guidance from this project (https://github.com/toby-p/rightmove_webscraper.py) but am currently a bit stuck on point 4 (Create an instance of the class on the URL).
I have tried to email the author, but they appear to be unavailable, and searching for other guidance on the web has only confused me further. Would anybody be able to give me some guidance on how to complete this step?
Sincere thanks.
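For reference, here is what I think point 4 is asking for, pieced together from the project README; the import assumes the package was installed with pip, and the URL is just a made-up example of a link to a page of search results, so I may well have misunderstood:

from rightmove_webscraper import RightmoveData

# Example URL copied from the browser address bar after running a search
# on rightmove.co.uk (a placeholder; any valid results-page link should work):
url = "https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=REGION%5E94346"

# "Create an instance of the class on the URL", i.e. call the class with it:
rm = RightmoveData(url)

Below is the class code as I currently have it: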
import datetime as dt
from lxml import html, etree
import numpy as np
import pandas as pd
import requests

class RightmoveData:
    """The `RightmoveData` web scraper collects structured data on properties
    returned by a search performed on www.rightmove.co.uk.

    An instance of the class provides attributes to access data from the search
    results, the most useful being `get_results`, which returns all results as a
    Pandas DataFrame object.

    The query to rightmove can be renewed by calling the `refresh_data` method.
    """

    def __init__(self, url: str, get_floorplans: bool = False):
        """Initialize the scraper with a URL from the results of a property
        search performed on www.rightmove.co.uk.

        Args:
            url (str): full HTML link to a page of rightmove search results.
            get_floorplans (bool): optionally scrape links to the individual
                floor plan images for each listing (be warned this drastically
                increases runtime so is False by default).
        """
        self.__status_code, self.__first_page = self._request(url)
        self.__url = url
        self.__validate_url()
        self.__results = self.__get_results(get_floorplans=get_floorplans)

    @staticmethod
    def _request(url: str):
        r = requests.get(url)
        return r.status_code, r.content

    def refresh_data(self, url: str = None, get_floorplans: bool = False):
        """Make a fresh GET request for the rightmove data.

        Args:
            url (str): optionally pass a new HTML link to a page of rightmove
                search results (else defaults to the current `url` attribute).
            get_floorplans (bool): optionally scrape links to the individual
                floor plan images for each listing (this drastically increases
                runtime so is False by default).
        """
        url = self.url if not url else url
        self.__status_code, self.__first_page = self._request(url)
        self.__url = url
        self.__validate_url()
        self.__results = self.__get_results(get_floorplans=get_floorplans)

    def __validate_url(self):
        """Basic validation that the URL at least starts in the right format and
        returns status code 200."""
        real_url = "{}://www.rightmove.co.uk/{}/find.html?"
        protocols = ["http", "https"]
        types = ["property-to-rent", "property-for-sale", "new-homes-for-sale"]
        urls = [real_url.format(p, t) for p in protocols for t in types]
        conditions = [self.url.startswith(u) for u in urls]
        conditions.append(self.__status_code == 200)
        if not any(conditions):
            raise ValueError(f"Invalid rightmove search URL:\n\n\t{self.url}")

    @property
    def url(self):
        return self.__url

    @property
    def get_results(self):
        """Pandas DataFrame of all results returned by the search."""
        return self.__results

    @property
    def results_count(self):
        """Total number of results returned by `get_results`. Note that the
        rightmove website may state a much higher number of results; this is
        because they artificially restrict the number of results pages that can
        be accessed to 42."""
        return len(self.get_results)

    @property
    def average_price(self):
        """Average price of all results returned by `get_results` (ignoring
        results which don't list a price)."""
        # Exclude unpriced listings from both the numerator and the
        # denominator, otherwise they drag the average down:
        prices = self.get_results["price"].dropna()
        return prices.sum() / len(prices)

    def summary(self, by: str = "number_bedrooms"):
        """Pandas DataFrame summarising the results by mean price and count.
        By default grouped by the `number_bedrooms` column but will accept any
        column name from `get_results` as a grouper.

        Args:
            by (str): valid column name from `get_results` DataFrame attribute.
        """
        assert by in self.get_results.columns, f"Column not found in `get_results`: {by}"
        df = self.get_results.dropna(axis=0, subset=["price"])
        groupers = {"price": ["count", "mean"]}
        df = df.groupby(df[by]).agg(groupers)
        df.columns = df.columns.get_level_values(1)
        df.reset_index(inplace=True)
        if "number_bedrooms" in df.columns:
            df["number_bedrooms"] = df["number_bedrooms"].astype(int)
            df.sort_values(by=["number_bedrooms"], inplace=True)
        else:
            df.sort_values(by=["count"], inplace=True, ascending=False)
        return df.reset_index(drop=True)

    @property
    def rent_or_sale(self):
        """String specifying if the search is for properties for rent or sale.
        Required because the Xpaths are different for the target elements."""
        if "/property-for-sale/" in self.url or "/new-homes-for-sale/" in self.url:
            return "sale"
        elif "/property-to-rent/" in self.url:
            return "rent"
        else:
            raise ValueError(f"Invalid rightmove URL:\n\n\t{self.url}")

    @property
    def results_count_display(self):
        """Returns an integer of the total number of listings as displayed on
        the first page of results. Note that not all listings are available to
        scrape because rightmove limits the number of accessible pages."""
        tree = html.fromstring(self.__first_page)
        xpath = """//span[@class="searchHeader-resultCount"]/text()"""
        return int(tree.xpath(xpath)[0].replace(",", ""))

    @property
    def page_count(self):
        """Returns the number of result pages returned by the search URL. There
        are 24 results per page. Note that the website limits results to a
        maximum of 42 accessible pages."""
        page_count = self.results_count_display // 24
        if self.results_count_display % 24 > 0:
            page_count += 1
        # Rightmove will return a maximum of 42 results pages, hence:
        if page_count > 42:
            page_count = 42
        return page_count

    def __get_page(self, request_content, get_floorplans=False):
        """Method to scrape data from a single page of search results. Used
        iteratively by the `get_results` method to scrape data from every page
        returned by the search."""
        # Process the html:
        tree = html.fromstring(request_content)

        # Set xpath for price:
        if self.rent_or_sale == "rent":
            xp_prices = """//span[@class="propertyCard-priceValue"]/text()"""
        elif self.rent_or_sale == "sale":
            xp_prices = """//div[@class="propertyCard-priceValue"]/text()"""
        else:
            raise ValueError("Invalid URL format.")

        # Set xpaths for listing title, property address, URL, and agent URL:
        xp_titles = """//div[@class="propertyCard-details"]\
//a[@class="propertyCard-link"]\
//h2[@class="propertyCard-title"]/text()"""
        xp_addresses = """//address[@class="propertyCard-address"]//span/text()"""
        xp_weblinks = """//div[@class="propertyCard-details"]//a[@class="propertyCard-link"]/@href"""
        xp_agent_urls = """//div[@class="propertyCard-contactsItem"]\
//div[@class="propertyCard-branchLogo"]\
//a[@class="propertyCard-branchLogo-link"]/@href"""

        # Create data lists from xpaths:
        price_pcm = tree.xpath(xp_prices)
        titles = tree.xpath(xp_titles)
        addresses = tree.xpath(xp_addresses)
        base = "http://www.rightmove.co.uk"
        weblinks = [f"{base}{link}" for link in tree.xpath(xp_weblinks)]
        agent_urls = [f"{base}{link}" for link in tree.xpath(xp_agent_urls)]

        # Optionally get floorplan links from property urls (longer runtime):
        floorplan_urls = list() if get_floorplans else np.nan
        if get_floorplans:
            for weblink in weblinks:
                status_code, content = self._request(weblink)
                if status_code != 200:
                    continue
                tree = html.fromstring(content)
                xp_floorplan_url = """//*[@id="floorplanTabs"]/div[2]/div[2]/img/@src"""
                floorplan_url = tree.xpath(xp_floorplan_url)
                if floorplan_url:
                    floorplan_urls.append(floorplan_url[0])
                else:
                    floorplan_urls.append(np.nan)

        # Store the data in a Pandas DataFrame:
        data = [price_pcm, titles, addresses, weblinks, agent_urls]
        data = data + [floorplan_urls] if get_floorplans else data
        temp_df = pd.DataFrame(data)
        temp_df = temp_df.transpose()
        columns = ["price", "type", "address", "url", "agent_url"]
        columns = columns + ["floorplan_url"] if get_floorplans else columns
        temp_df.columns = columns

        # Drop empty rows which come from placeholders in the html:
        temp_df = temp_df[temp_df["address"].notnull()]
        return temp_df

    def __get_results(self, get_floorplans=False):
        """Build a Pandas DataFrame with all results returned by the search."""
        results = self.__get_page(self.__first_page, get_floorplans=get_floorplans)

        # Iterate through all pages scraping results:
        for p in range(1, self.page_count + 1, 1):
            # Create the URL of the specific results page:
            p_url = f"{str(self.url)}&index={p * 24}"
            # Make the request:
            status_code, content = self._request(p_url)
            # Requests to scrape lots of pages eventually get status 400, so:
            if status_code != 200:
                break
            # Create a temporary DataFrame of page results:
            temp_df = self.__get_page(content, get_floorplans=get_floorplans)
            # Concatenate the temporary DataFrame with the full DataFrame:
            frames = [results, temp_df]
            results = pd.concat(frames)

        # Reset the index:
        results.reset_index(inplace=True, drop=True)

        # Convert price column to numeric type:
        results["price"].replace(regex=True, inplace=True, to_replace=r"\D", value=r"")
        results["price"] = pd.to_numeric(results["price"])

        # Extract postcodes to a separate column:
        pat = r"\b([A-Za-z][A-Za-z]?[0-9][0-9]?[A-Za-z]?)\b"
        results["postcode"] = results["address"].astype(str).str.extract(pat, expand=True)

        # Extract number of bedrooms from `type` to a separate column:
        pat = r"\b([\d][\d]?)\b"
        results["number_bedrooms"] = results["type"].astype(str).str.extract(pat, expand=True)
        results.loc[results["type"].str.contains("studio", case=False), "number_bedrooms"] = 0
        results["number_bedrooms"] = pd.to_numeric(results["number_bedrooms"])

        # Clean up annoying white spaces and newlines in `type` column:
        results["type"] = results["type"].str.strip("\n").str.strip()

        # Add column with datetime when the search was run (i.e. now):
        now = dt.datetime.today()
        results["search_date"] = now

        return results
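
And assuming the instance is created successfully, my understanding from the docstrings above is that the data is then read off the class attributes, along these lines (an untested sketch on my part, using the same placeholder URL as before):

url = "https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=REGION%5E94346"
rm = RightmoveData(url)

# All results as a pandas DataFrame:
df = rm.get_results
print(df.head())

# Headline figures exposed by the properties documented above:
print(rm.results_count)   # number of listings actually scraped
print(rm.average_price)   # mean price of listings that show a price
print(rm.rent_or_sale)    # "rent" or "sale", parsed from the URL

# Count and mean price grouped by number of bedrooms:
print(rm.summary())

# Re-run the query later without building a new instance:
rm.refresh_data()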