Scrape the computer-science course listings on XuetangX (学堂在线).
Save each course's name, teacher(s), school, and enrollment count to a CSV file.
Link:
# items.py
import scrapy

class StudyhallItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()       # course name
    teacher = scrapy.Field()    # teacher(s)
    school = scrapy.Field()     # school offering the course
    peopleNum = scrapy.Field()  # enrollment count
# settings.py: register the pipeline; 300 is its priority (0-1000, lower runs first)
ITEM_PIPELINES = {'studyHall.pipelines.StudyhallPipeline': 300}
# pipelines.py
from itemadapter import ItemAdapter
import csv

class StudyhallPipeline(object):
    def open_spider(self, spider):
        try:
            # open the csv file before the spider starts
            self.file = open('StudyHallData.csv', 'w', encoding='utf-8', newline='')
            self.csv = csv.writer(self.file)
        except Exception as e:
            print(e)

    def process_item(self, item, spider):
        # one row per course
        self.csv.writerow(list(item.values()))
        return item

    def close_spider(self, spider):
        self.file.close()
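As written, the CSV has no header row, and list(item.values()) follows the order in which the spider assigns the fields (name, school, peopleNum, teacher), not the declaration order in items.py. If column names are wanted, a minimal variant of open_spider would be (the header order here is an assumption that mirrors parse() below):

    def open_spider(self, spider):
        self.file = open('StudyHallData.csv', 'w', encoding='utf-8', newline='')
        self.csv = csv.writer(self.file)
        # header row; order mirrors the field assignments in parse()
        self.csv.writerow(['name', 'school', 'peopleNum', 'teacher'])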
# spiders/studyHall.py
import json
import scrapy
from pprint import pprint
from studyHall.items import StudyhallItem

class studyHallSpider(scrapy.spiders.Spider):
    name = 'studyHall'
    allowed_domains = ['www.xuetangx.com']
    # course-list endpoint, reconstructed from the 'path' header below
    url_pat = 'https://www.xuetangx.com/api/v1/lms/get_product_list/page={}'
    # POST payload, copied from the browser's developer tools
    # (classify ["1"] appears to select the computer-science category)
    data = '{"query":"","chief_org":[],"classify":["1"],"selling_type":[],"status":[],"appid":10000}'
    # request headers, copied from the browser
    headers = {
        'Host': 'www.xuetangx.com',
        'authority': 'www.xuetangx.com',
        'method': 'POST',
        'path': '/api/v1/lms/get_product_list/page=1',
        'accept': 'application/json, text/plain, */*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh',
        'content-type': 'application/json',
        'cookie': '_ga=GA1.2.192047866.1605620269; provider=xuetang; django_language=zh',
        'django-language': 'zh',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36 Edg/86.0.622.69',
        'x-client': 'web',
        'xtbz': 'xt'
    }
    def start_requests(self):
        """Create the POST requests explicitly via start_requests."""
        for page in range(1, 6):
            # scrape 5 pages of listings
            yield scrapy.FormRequest(
                url=self.url_pat.format(page),
                headers=self.headers,
                method='POST',
                body=self.data,
                callback=self.parse
            )
    def parse(self, response):
        msg = json.loads(response.body)
        for each in msg['data']['product_list']:
            item = StudyhallItem()
            item['name'] = each['name']
            item['school'] = each['org']['name']
            item['peopleNum'] = each['count']
            teacherList = []
            # some courses have several teachers; collect them all
            # so each course is still written as a single record
            for teacher in each['teacher']:
                teacherList.append(teacher['name'])
            item['teacher'] = ','.join(teacherList)
            yield item
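For reference, parse() assumes the endpoint answers with JSON shaped roughly as follows; the keys are taken from the code above, while the values are purely illustrative:

# illustrative response structure, not real data
sample_response = {
    "data": {
        "product_list": [
            {
                "name": "course name",
                "org": {"name": "school name"},
                "count": 12345,                         # enrollment count
                "teacher": [{"name": "teacher name"}],  # one dict per teacher
            }
        ]
    }
}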
A sample of the scraped data:
Scrape listing data for four Beijing districts: Dongcheng, Xicheng, Haidian, and Chaoyang (5 pages per district). Save each property's name, total price, floor area, and unit price to a JSON file.
# items.py
import scrapy

class HomelinkItem(scrapy.Item):
    location = scrapy.Field()   # district
    name = scrapy.Field()       # property name
    area = scrapy.Field()       # floor area in square metres
    TTprice = scrapy.Field()    # total price
    UnitPrice = scrapy.Field()  # price per square metre
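The assignment asks for JSON output, but the pipeline module itself is not shown here; a minimal sketch of what HomelinkPipeline could look like (the output file name is an assumption):

# pipelines.py -- minimal sketch of HomelinkPipeline
import json

class HomelinkPipeline(object):
    def open_spider(self, spider):
        self.items = []

    def process_item(self, item, spider):
        self.items.append(dict(item))
        return item

    def close_spider(self, spider):
        # ensure_ascii=False keeps the Chinese district names readable
        with open('HomeLinkData.json', 'w', encoding='utf-8') as f:
            json.dump(self.items, f, ensure_ascii=False, indent=2)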
# settings.py
BOT_NAME = 'homeLink'
SPIDER_MODULES = ['homeLink.spiders']
NEWSPIDER_MODULE = 'homeLink.spiders'
ITEM_PIPELINES = {'homeLink.pipelines.HomelinkPipeline': 300}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
"""对付反爬虫"""
importrandom
#useragent列表
USER_AGENT_LIST=[
'MSIE(MSIE6.0;X11;Linux;i686)Opera7.23',
'Opera/9.20(Macintosh;IntelMacOSX;U;en)',
'Opera/9.0(Macintosh;PPCMacOSX;U;en)',
'iTunes/9.0.3(Macintosh;U;IntelMacOSX10_6_2;en-ca)',
'Mozilla/4.76[en_jp](X11;U;SunOS5.8sun4u)',
'iTunes/4.2(Macintosh;U;PPCMacOSX10.2)',
'Mozilla/5.0(Macintosh;IntelMacOSX10.6;rv:5.0)Gecko/20100101Firefox/5.0',
'Mozilla/5.0(Macintosh;IntelMacOSX10.6;rv:9.0)Gecko/20100101Firefox/9.0',
'Mozilla/5.0(Macintosh;IntelMacOSX10.8;rv:16.0)Gecko/20120813Firefox/16.0',
'Mozilla/4.77[en](X11;I;IRIX;646.5IP30)',
'Mozilla/4.8[en](X11;U;SunOS;5.7sun4u)'
]
#随机生成useragent
USER_AGENT=random.choice(USER_AGENT_LIST)
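Note that random.choice runs once when the settings module is imported, so the whole crawl shares a single user agent. Rotating it per request takes a downloader middleware; a minimal sketch (the module and class names are assumptions, not part of the original project):

# middlewares.py -- hypothetical per-request user-agent rotation
import random
from homeLink.settings import USER_AGENT_LIST

class RandomUserAgentMiddleware(object):
    def process_request(self, request, spider):
        # overwrite the User-Agent header of every outgoing request
        request.headers['User-Agent'] = random.choice(USER_AGENT_LIST)

It would then be enabled in settings.py with DOWNLOADER_MIDDLEWARES = {'homeLink.middlewares.RandomUserAgentMiddleware': 400}.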
# spiders/homeLink.py
import scrapy
from homeLink.items import HomelinkItem

class MySpider(scrapy.Spider):
    name = 'homeLink'
    allowed_domains = ['bj.lianjia.com']
    start_urls = []
    locations = ['dongcheng', 'xicheng', 'haidian', 'chaoyang']  # the four districts
    for loc in locations:
        # first five pages of each district; the listing-page URL pattern is
        # reconstructed from the response.url parsing in parse() below
        for page in range(1, 6):
            start_urls.append('https://bj.lianjia.com/ershoufang/{}/pg{}/'.format(loc, page))

    def parse(self, response):
        """Scrape the listing entries under this path."""
        # map the district slug in the URL back to its Chinese name
        district = response.url.split('/')[-3]
        for each in response.xpath("//div[@id='content']/div[1]/ul/*"):
            # one item per listing on the page
            item = HomelinkItem()
            # district the property is located in
            if district == 'dongcheng':
                item['location'] = '东城'
            elif district == 'xicheng':
                item['location'] = '西城'
            elif district == 'haidian':
                item['location'] = '海淀'
            elif district == 'chaoyang':
                item['location'] = '朝阳'
            # property name
            item['name'] = each.xpath("./div[1]/div[@class='title']/a/text()").extract()
            # floor area in square metres
            item['area'] = each.xpath("./div[1]/div[@class='address']/div/text()").extract()[0].split('|')[1]
            # total price, suffixed with 万 (ten thousand yuan)
            item['TTprice'] = str(each.xpath("./div[1]/div[@class='priceInfo']/div[1]/span/text()").extract()[0]) + '万'
            # unit price per square metre
            item['UnitPrice'] = each.xpath("./div[1]/div[@class='priceInfo']/div[2]/span/text()").extract()
            # skip listings where any field failed to extract
            if item['name'] and item['area'] and item['TTprice'] and item['UnitPrice']:
                yield item
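After the crawl finishes (scrapy crawl homeLink), the output can be sanity-checked; this snippet assumes the pipeline sketch above, which wrote HomeLinkData.json:

# quick check of the scraped output (file name follows the pipeline sketch)
import json

with open('HomeLinkData.json', encoding='utf-8') as f:
    records = json.load(f)
print(len(records))  # at most 4 districts x 5 pages of listings
print(records[0])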