import requests
from bs4 import BeautifulSoup
def get_html(url, timeout=30):
    """Download *url* and return its body parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the whole crawl
            forever.

    Returns:
        A ``BeautifulSoup`` object built from the raw response bytes with the
        ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout elapses.
    """
    resp = requests.get(url, timeout=timeout)
    # The HTTP status is not inspected: error pages are parsed like any other.
    return BeautifulSoup(resp.content, 'html.parser')
# Fetch the landing page and collect the href of every anchor matched by the
# brand selector.
home_html = get_html("https://www.cars-data.com/")
brand_as = home_html.select(
    ".brands_plus_most .row .col-5 .row.center div.col-2 a"
)
brand_links = [anchor["href"] for anchor in brand_as]
len(brand_links)
# Recorded output of the expression above: 89
from ipywidgets import FloatProgress
from IPython.display import display
%%time
# Progress widget with one tick per brand page to fetch.
bar = FloatProgress(min=0, max=len(brand_links))
display(bar)

# Visit each brand page and accumulate the hrefs matched by the selector.
seriess_links = []
for brand_link in brand_links:
    brand_html = get_html(brand_link)
    series_as = brand_html.select("section.models div.col-4 a")
    series_links = [anchor["href"] for anchor in series_as]
    seriess_links.extend(series_links)
    bar.value = bar.value + 1  # advance the widget once per fetched page
# Recorded %%time output: CPU times: user 4.95 s, sys: 258 ms, total: 5.21 s Wall time: 1min 12s
# Number of links gathered, duplicates included.
len(seriess_links)
# Recorded output of the expression above: 1670

# Collapse duplicates; the result is an unordered set.
seriess_links = {*seriess_links}
len(seriess_links)
# Recorded output of the expression above: 1670
%%time
# Progress widget with one tick per series page to fetch.
bar = FloatProgress(min=0, max=len(seriess_links))
display(bar)

# Visit each series page and accumulate the hrefs matched by the selector.
models_links = []
for series_link in seriess_links:
    series_html = get_html(series_link)
    model_as = series_html.select("section.models div.col-4 a")
    model_links = [anchor["href"] for anchor in model_as]
    models_links.extend(model_links)
    bar.value = bar.value + 1  # advance the widget once per fetched page
# Number of links gathered, duplicates included.
len(models_links)

# Collapse duplicates; the result is an unordered set.
models_links = {*models_links}
len(models_links)

# Peek at three entries (set order is arbitrary).
[*models_links][:3]
%%time
# Progress widget with one tick per model page to fetch.
bar = FloatProgress(min=0, max=len(models_links))
display(bar)

# Visit each model page and accumulate the hrefs matched by the selector.
variants_links = []
for model_link in models_links:
    model_html = get_html(model_link)
    variant_as = model_html.select(
        "section.types div.col-8 div.row div.col-6 h2 a"
    )
    variant_links = [anchor["href"] for anchor in variant_as]
    variants_links.extend(variant_links)
    bar.value = bar.value + 1  # advance the widget once per fetched page
# Number of links gathered, duplicates included.
len(variants_links)

# Collapse duplicates and go back to a list so it can be sliced below.
variants_links = list(set(variants_links))
len(variants_links)

# Drop the empty href if one was collected. A plain list.remove("") would
# raise ValueError on a crawl that happened to contain no empty href.
variants_links = [link for link in variants_links if link != ""]
variants_links[:3]
from itertools import zip_longest
def grouper(iterable, n, fillvalue=None):
    """Split *iterable* into consecutive tuples of length *n*.

    The last tuple is padded with *fillvalue* when the input runs out early.
    Taken from the itertools recipes:
    https://docs.python.org/3/library/itertools.html#itertools-recipes

    Example: grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    """
    # Every zip_longest slot draws from the same iterator object, so each
    # output tuple consumes the next n items in order.
    shared = iter(iterable)
    slots = [shared for _ in range(n)]
    return zip_longest(*slots, fillvalue=fillvalue)
%%time
# Progress widget with one tick per variant page to fetch.
bar = FloatProgress(min=0, max=len(variants_links))
display(bar)

# Build one dict per variant page: its URL, the breadcrumb fields and every
# <dt>/<dd> pair found on the page.
datas = []
for variant_link in variants_links:
    variant_html = get_html(variant_link)
    data = {"url": variant_link}

    # The breadcrumb trail must hold exactly five links; the first is
    # discarded. Any other count raises ValueError on the unpacking below.
    breadcrumbs = [el.text.strip() for el in variant_html.select("#breadcrumb a")]
    _, data["make"], data["model"], data["series"], data["variant"] = breadcrumbs

    # Pair each <dt> with the <dd> at the same position; zip stops at the
    # shorter list. Keys have surrounding colons stripped, values are raw text.
    dts = variant_html.select("dt")
    dds = variant_html.select("dd")
    for dt, dd in zip(dts, dds):
        data[dt.text.strip(":")] = dd.text

    datas.append(data)
    bar.value += 1
len(datas)
import pandas as pd
%%time
# One row per scraped dict; columns are the union of the dict keys.
df = pd.DataFrame(data=datas)

# First row, as a quick sanity check.
df.iloc[0]

# Column summary, with memory measured by inspecting object contents.
df.info(memory_usage="deep")
%%time
# Write the table as a tab-separated UTF-8 file.
df.to_csv(
    "card_data_tabs.csv",
    sep="\t",
    encoding="utf-8",
)

# Column labels as a plain list.
[*df.columns]