You need to parse your data from the script tag rather than the spans and divs.
Try this:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from pandas import json_normalize
import json
def get_page(url):
    """Download *url* and parse the response body with the lxml parser.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``BeautifulSoup`` tree built from the response text, or ``None``
        when the server answers with a non-OK status (the status code is
        printed in that case).
    """
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
        # Leave explicitly: no soup exists on this path, so there is
        # nothing valid to hand back to the caller.
        return None
    return BeautifulSoup(response.text, 'lxml')
def get_detail_data(url):
    """Scrape one detail page into a DataFrame.

    The data is read from the JSON assigned to ``window.__INITIAL_STATE__``
    inside a ``<script>`` tag of the raw response text, not from the
    rendered HTML elements.

    Args:
        url: Address of the detail page.

    Returns:
        A DataFrame holding the selected site columns with the fields of
        ``primary_location`` appended as extra columns.

    Raises:
        IndexError: If the response text does not contain the
            ``window.__INITIAL_STATE__`` script marker.
    """
    res = requests.get(url)
    # The payload is cut straight out of the raw text, so the page does not
    # need to be parsed into an HTML tree at all.
    raw = res.text.split("<script> window.__INITIAL_STATE__=")[1]
    raw = raw.split("</script>")[0]
    # Decoded twice: the first decode must yield a string that is itself
    # JSON, otherwise the second json.loads would fail.
    state = json.loads(json.loads(raw))
    cols = ['abn', 'address', 'name', 'primary_location', 'service_area', 'state', 'suburb', 'website']
    # Presumably sites.list maps a site id to its fields, so transposing
    # turns every site into one row -- verify against a live payload.
    df = pd.DataFrame(state["sites"]["list"]).T
    df = df[cols].reset_index(drop=True)
    # NOTE(review): only the first row's primary_location is flattened; any
    # further rows get NaN in the appended columns -- confirm one site/page.
    primary_location = json_normalize(df.primary_location[0])
    df = pd.concat([df, primary_location], axis=1)
    to_drop = ["primary_location", "is_primary", "suburb_seo_key", "capital_city_seo_key"]
    return df.drop(to_drop, axis=1)
def get_index_data(soup):
    """Collect the absolute detail-page URLs from a parsed listing page.

    Args:
        soup: Parsed listing page, or ``None`` when the page could not be
            fetched.

    Returns:
        A list of ``https://hipages.com.au...`` URLs, one per matching
        ``<h3>`` heading whose preceding node carries an ``href``. The list
        is empty when *soup* is ``None``.
    """
    if soup is None:
        return []
    # NOTE(review): these look like generated CSS class names and may change
    # whenever the site is redeployed -- confirm before relying on them.
    headings = soup.find_all("h3", {'class': 'sc-bZQynM sc-iwsKbI dpKmnV'})
    # The link is taken from the node parsed immediately before each heading.
    hrefs = (heading.previous_element.get('href') for heading in headings)
    # Skip missing hrefs so no "...com.auNone" URL is ever produced.
    return [f"https://hipages.com.au{href}" for href in hrefs if href]
def Main(mainurl="https://hipages.com.au/find/antenna_services/nsw/sydney"):
    """Scrape every detail page linked from a listing page.

    Args:
        mainurl: Listing page to crawl. Defaults to the Sydney antenna
            services listing, so calling ``Main()`` with no arguments
            behaves as before.

    Returns:
        A list with one DataFrame per detail page, in the order the links
        were found on the listing page.
    """
    detail_urls = get_index_data(get_page(mainurl))
    return [get_detail_data(detail_url) for detail_url in detail_urls]
# Scrape all detail pages, then stack the per-page frames into one table.
data = Main()
# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame when nothing was scraped.
df = pd.concat(data).reset_index(drop=True) if data else pd.DataFrame()
# display() is provided by IPython/Jupyter; use print(df) in a plain script.
display(df)
By the way, this gives you much more detailed data than scraping the spans and divs.
7
Solved: How to scrape multiple results having the same tags and class