58二手房
需求:
爬取58二手房数据(标题,价格)
如图所圈数据:
脚本:import requests
from lxml import etree
url="https://wx.58.com/ershoufang/"
headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36 Edg/88.0.705.74"}
page_text=requests.get(url,headers=headers).text
tree=etree.HTML(page_text)
section_list=tree.xpath('//section[@class="list"]')
div_list=section_list[0].xpath('./div')
file=open("./58二手房.txt","w",encoding="utf-8")
for div in div_list:
    title=div.xpath('./a/div[2]/div[1]/div[1]/h3/text()')[0]
    print(title)
    price_total=div.xpath('./a/div[2]/div[2]/p[1]//text()')
    price_total="".join(price_total)
    price_average=div.xpath('./a/div[2]/div[2]/p[2]/text()')[0]
    file.write(title+'\t'+price_total+'\t'+price_average+'\n')
file.close()
效果图:

彼岸图网
需求:
爬取彼岸图网4k风景图
脚本:import requests
import os
from lxml import etree
if not os.path.exists("./4k风景"):
    os.mkdir("./4k风景")
    
headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36 Edg/88.0.705.74"}
for i in range(1,6):
    if i==1:
        url="http://pic.netbian.com/4kfengjing/"
    else:
        url="http://pic.netbian.com/4kfengjing/index_{0}.html".format(i)
    response=requests.get(url,headers=headers)
    page_text=response.text
    tree=etree.HTML(page_text)
    li_list=tree.xpath('//ul[@class="clearfix"]/li')
    for li in li_list:
        img_src="http://pic.netbian.com"+li.xpath('./a/img/@src')[0]
        img_name=li.xpath('./a/img/@alt')[0]+".jpg"
        # 解决中文乱码问题的通法
        img_name=img_name.encode("iso-8859-1").decode("gbk")
        img_path="./4k风景/"+img_name
        img=requests.get(img_src,headers=headers).content
        with open(img_path,"wb") as file:
            file.write(img)
        print(img_name,"下载成功...")
爬取效果:

全国城市名称
需求:
爬取全国城市名称(https://www.aqistudy.cn/historydata/)
脚本:import requests
from lxml import etree
url="https://www.aqistudy.cn/historydata/"
headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36 Edg/88.0.705.74"}
page_text=requests.get(url,headers=headers).text
tree=etree.HTML(page_text)
city_name=tree.xpath('//div[@class="bottom"]/ul/li/a/text() | //div[@class="bottom"]/ul/div[2]/li/a/text()')
print(city_name,len(city_name))
效果图:
简历模板
需求:
下载站长素材中的免费简历模板(https://sc.chinaz.com/jianli/)
脚本:import requests
from lxml import etree
import os
if not os.path.exists("./简历模板"):
    os.mkdir("./简历模板")
    
headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36 Edg/88.0.705.74"}
for i in range(1,3):
    if i==1:
        url="https://sc.chinaz.com/jianli/index.html"
    else:
        url="https://sc.chinaz.com/jianli/index_{0}.html".format(i)
    page_text=requests.get(url,headers=headers).text
    tree=etree.HTML(page_text)
    div_list=tree.xpath('//div[@class="sc_warp  mt20"]/div/div/div')
    for j in div_list:
        href="https:"+j.xpath('./a/@href')[0]
        page_text=requests.get(href,headers=headers).text
        tree=etree.HTML(page_text)
        download=tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li[4]/a/@href')
        if len(download)==0:
            continue
        download=download[0]
        rar=requests.get(download,headers=headers).content
        rar_name=download.split("/")[-1]
        rar_path="./简历模板/"+rar_name
        with open(rar_path,"wb") as file:
            file.write(rar)
        print(rar_name,"下载成功...")
        
效果图:











