# SAVE ZOOPLA LISTINGS TO CSV
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import sys
# Search used when no URL is given on the command line.
default_url = 'https://www.zoopla.co.uk/for-sale/houses/SE5/?beds_min=1&is_auction=false&is_retirement_home=false&is_shared_ownership=false&price_max=250000&price_min=90000&q=SE5&radius=10&results_sort=newest_listings&search_source=home&page_size=100'
# The first command-line argument, when present, overrides the default search.
my_url = sys.argv[1] if len(sys.argv) > 1 else default_url
# Open and read the page. The context manager closes the connection even if
# the read fails part-way through.
with uReq(my_url) as uClient:
    page_html = uClient.read()
page_soup = soup(page_html, "html.parser")

# Grab each listing: every div with class "ListingsContainer".
containers = page_soup.find_all("div", {"class": "ListingsContainer"})

# Get list details to see if there are more pages: the number after "of " in
# the results counter is taken as the total number of listings.
# NOTE(review): if the counter span is absent, find() returns None and the
# next line raises AttributeError - confirm whether a fallback is wanted.
list_details = page_soup.find("span", {"class": "listing-results-utils-count"})
# Drop thousands separators so a total such as "1,234" still parses.
number_of_listings = int(list_details.text.partition("of ")[2].replace(",", ""))
# prep output file
import csv
from datetime import date


def TextOrNan(n):
    """Return the text of ``n`` as a str, or 'nan' when ``n`` is None.

    Defined once here rather than inside the listing loop, where it would be
    redefined for every listing.
    """
    if n is not None:
        return str(n.text)
    return 'nan'


# Column names, used for both the CSV header row and the console dump.
infos = [
    "Link", "Title", "Price", "Modifier", "Agent", "Agent Tel", "Address",
    "Nearby", "All Attributes", "Bedrooms", "Bathrooms", "Reception Rooms",
    "Sq Ft",
]

today_string = date.today().strftime('%y%m%d')
# e.g. 200619 for 19/06/2020, and prefix this to the filename
filename = today_string + "-zoopla-listings.csv"

# `with` closes the file even when a listing fails to parse. newline="" is
# what the csv module expects so it can control line endings itself, and the
# explicit encoding avoids depending on the platform default.
with open(filename, "w", newline="", encoding="utf-8") as f:
    # csv.writer quotes fields containing commas, quotes or newlines, so the
    # field text is written unmodified instead of having commas substituted.
    writer = csv.writer(f)
    writer.writerow(infos)

    # loop through containers to grab info of interest
    for container in containers:
        listing_price = container.find("a", {"class": "listing-results-price"})
        link = "https://www.zoopla.co.uk" + listing_price["href"]

        # The modifier must be read first: extract() removes the span from
        # the price link, so the text left behind is the price alone.
        # A listing without a modifier span gets an empty modifier.
        modifier_tag = listing_price.span
        if modifier_tag is not None:
            modifier = modifier_tag.extract().text.strip()
        else:
            modifier = ""
        price = listing_price.text.strip()

        # split()/join collapses runs of whitespace into single spaces
        title = container.find("h2", {"class": "listing-results-attr"})
        title = " ".join(title.text.split())

        agent = container.find("div", {"class": "agent_logo"}).img["alt"]
        agent_phone = container.find(
            "span", {"class": "agent_phone"}
        ).a.span.text.strip()

        nearby_stations_schools = container.find(
            "div", {"class": "nearby_stations_schools"}
        ).text
        nearby_stations_schools = " ".join(nearby_stations_schools.split())

        prop_address = container.find(
            "a", {"class": "listing-results-address"}
        ).text

        # Property Attributes (Bedrooms, Bathrooms, Reception rooms, etc.);
        # each becomes its text, or 'nan' when the span does not exist.
        attributes = container.find("h3", {"class": "listing-results-attr"})
        bedrooms = TextOrNan(attributes.find("span", {"class": "num-beds"}))
        bathrooms = TextOrNan(attributes.find("span", {"class": "num-baths"}))
        reception_rooms = TextOrNan(
            attributes.find("span", {"class": "num-reception"})
        )
        num_sqft = TextOrNan(attributes.find("span", {"class": "num-sqft"}))

        # Attribute summary string: the title of every matching span,
        # skipping spans that have no title attribute.
        attribute_list = [
            attr["title"]
            for attr in attributes.select('h3 > span')
            if attr.has_attr("title")
        ]
        attribute_string = " ".join(attribute_list)

        data = [
            link, title, price, modifier, agent, agent_phone, prop_address,
            nearby_stations_schools, attribute_string, bedrooms, bathrooms,
            reception_rooms, num_sqft,
        ]

        # Print to console
        for info, value in zip(infos, data):
            print(info + ": " + str(value))

        # Write to CSV
        writer.writerow(data)

print("Number of Listings: " + str(number_of_listings))
print("Number Saved: " + str(len(containers)))
print("If these numbers differ, you may need to go to the next page and collect more.")