Scrape the computer-science course listings on XuetangX (学堂在线).
Save each course's name, teacher(s), school, and enrollment count to a CSV file.
Link:
# items.py
import scrapy

class StudyhallItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()       # course name
    teacher = scrapy.Field()    # teacher(s)
    school = scrapy.Field()     # school offering the course
    peopleNum = scrapy.Field()  # enrollment count
# settings.py: register the pipeline; 300 is its priority (0-1000, lower runs first)
ITEM_PIPELINES = {'studyHall.pipelines.StudyhallPipeline': 300}
# pipelines.py
from itemadapter import ItemAdapter
import csv

class StudyhallPipeline(object):
    def open_spider(self, spider):
        try:
            # open the csv file before the spider starts
            self.file = open('StudyHallData.csv', 'w', encoding='utf-8', newline='')
            self.csv = csv.writer(self.file)
        except Exception as e:
            print(e)

    def process_item(self, item, spider):
        # one row per course
        self.csv.writerow(list(item.values()))
        return item

    def close_spider(self, spider):
        self.file.close()
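As written, the CSV has no header row, and list(item.values()) follows the order in which the spider assigns the fields (name, school, peopleNum, teacher), not the declaration order in items.py. If column names are wanted, a minimal variant of open_spider would be (the header order here is an assumption that mirrors parse() below):

    def open_spider(self, spider):
        self.file = open('StudyHallData.csv', 'w', encoding='utf-8', newline='')
        self.csv = csv.writer(self.file)
        # header row; order mirrors the field assignments in parse()
        self.csv.writerow(['name', 'school', 'peopleNum', 'teacher'])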
# spiders/studyHall.py
import json
import scrapy
from pprint import pprint
from studyHall.items import StudyhallItem

class studyHallSpider(scrapy.spiders.Spider):
    name = 'studyHall'
    allowed_domains = ['www.xuetangx.com']
    # course-list endpoint, reconstructed from the 'path' header below
    url_pat = 'https://www.xuetangx.com/api/v1/lms/get_product_list/page={}'
    # POST payload, copied from the browser's developer tools
    # (classify ["1"] appears to select the computer-science category)
    data = '{"query":"","chief_org":[],"classify":["1"],"selling_type":[],"status":[],"appid":10000}'
    # request headers, copied from the browser
    headers = {
        'Host': 'www.xuetangx.com',
        'authority': 'www.xuetangx.com',
        'method': 'POST',
        'path': '/api/v1/lms/get_product_list/page=1',
        'accept': 'application/json, text/plain, */*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh',
        'content-type': 'application/json',
        'cookie': '_ga=GA1.2.192047866.1605620269; provider=xuetang; django_language=zh',
        'django-language': 'zh',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36 Edg/86.0.622.69',
        'x-client': 'web',
        'xtbz': 'xt'
    }
    def start_requests(self):
        """Create the POST requests explicitly via start_requests."""
        for page in range(1, 6):
            # scrape 5 pages of listings
            yield scrapy.FormRequest(
                url=self.url_pat.format(page),
                headers=self.headers,
                method='POST',
                body=self.data,
                callback=self.parse
            )
    def parse(self, response):
        msg = json.loads(response.body)
        for each in msg['data']['product_list']:
            item = StudyhallItem()
            item['name'] = each['name']
            item['school'] = each['org']['name']
            item['peopleNum'] = each['count']
            teacherList = []
            # some courses have several teachers; collect them all
            # so each course is still written as a single record
            for teacher in each['teacher']:
                teacherList.append(teacher['name'])
            item['teacher'] = ','.join(teacherList)
            yield item
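For reference, parse() assumes the endpoint answers with JSON shaped roughly as follows; the keys are taken from the code above, while the values are purely illustrative:

# illustrative response structure, not real data
sample_response = {
    "data": {
        "product_list": [
            {
                "name": "course name",
                "org": {"name": "school name"},
                "count": 12345,                         # enrollment count
                "teacher": [{"name": "teacher name"}],  # one dict per teacher
            }
        ]
    }
}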
A sample of the scraped data:
Scrape listing data for four Beijing districts: Dongcheng, Xicheng, Haidian, and Chaoyang (5 pages per district). Save each property's name, total price, floor area, and unit price to a JSON file.
# items.py
import scrapy

class HomelinkItem(scrapy.Item):
    location = scrapy.Field()   # district
    name = scrapy.Field()       # property name
    area = scrapy.Field()       # floor area in square metres
    TTprice = scrapy.Field()    # total price
    UnitPrice = scrapy.Field()  # price per square metre
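The assignment asks for JSON output, but the pipeline module itself is not shown here; a minimal sketch of what HomelinkPipeline could look like (the output file name is an assumption):

# pipelines.py -- minimal sketch of HomelinkPipeline
import json

class HomelinkPipeline(object):
    def open_spider(self, spider):
        self.items = []

    def process_item(self, item, spider):
        self.items.append(dict(item))
        return item

    def close_spider(self, spider):
        # ensure_ascii=False keeps the Chinese district names readable
        with open('HomeLinkData.json', 'w', encoding='utf-8') as f:
            json.dump(self.items, f, ensure_ascii=False, indent=2)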
# settings.py
BOT_NAME = 'homeLink'
SPIDER_MODULES = ['homeLink.spiders']
NEWSPIDER_MODULE = 'homeLink.spiders'
ITEM_PIPELINES = {'homeLink.pipelines.HomelinkPipeline': 300}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
"""对付反爬虫"""
importrandom
#useragent列表
USER_AGENT_LIST=[
'MSIE(MSIE6.0;X11;Linux;i686)Opera7.23',
'Opera/9.20(Macintosh;IntelMacOSX;U;en)',
'Opera/9.0(Macintosh;PPCMacOSX;U;en)',
'iTunes/9.0.3(Macintosh;U;IntelMacOSX10_6_2;en-ca)',
'Mozilla/4.76[en_jp](X11;U;SunOS5.8sun4u)',
'iTunes/4.2(Macintosh;U;PPCMacOSX10.2)',
'Mozilla/5.0(Macintosh;IntelMacOSX10.6;rv:5.0)Gecko/20100101Firefox/5.0',
'Mozilla/5.0(Macintosh;IntelMacOSX10.6;rv:9.0)Gecko/20100101Firefox/9.0',
'Mozilla/5.0(Macintosh;IntelMacOSX10.8;rv:16.0)Gecko/20120813Firefox/16.0',
'Mozilla/4.77[en](X11;I;IRIX;646.5IP30)',
'Mozilla/4.8[en](X11;U;SunOS;5.7sun4u)'
]
#随机生成useragent
USER_AGENT=random.choice(USER_AGENT_LIST)
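Note that random.choice runs once when the settings module is imported, so the whole crawl shares a single user agent. Rotating it per request takes a downloader middleware; a minimal sketch (the module and class names are assumptions, not part of the original project):

# middlewares.py -- hypothetical per-request user-agent rotation
import random
from homeLink.settings import USER_AGENT_LIST

class RandomUserAgentMiddleware(object):
    def process_request(self, request, spider):
        # overwrite the User-Agent header of every outgoing request
        request.headers['User-Agent'] = random.choice(USER_AGENT_LIST)

It would then be enabled in settings.py with DOWNLOADER_MIDDLEWARES = {'homeLink.middlewares.RandomUserAgentMiddleware': 400}.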
# spiders/homeLink.py
import scrapy
from homeLink.items import HomelinkItem

class MySpider(scrapy.Spider):
    name = 'homeLink'
    allowed_domains = ['bj.lianjia.com']
    start_urls = []
    locations = ['dongcheng', 'xicheng', 'haidian', 'chaoyang']  # the four districts
    for loc in locations:
        # first five pages of each district; the listing-page URL pattern is
        # reconstructed from the response.url parsing in parse() below
        for page in range(1, 6):
            start_urls.append('https://bj.lianjia.com/ershoufang/{}/pg{}/'.format(loc, page))

    def parse(self, response):
        """Scrape the listing entries under this path."""
        # map the district slug in the URL back to its Chinese name
        district = response.url.split('/')[-3]
        for each in response.xpath("//div[@id='content']/div[1]/ul/*"):
            # one item per listing on the page
            item = HomelinkItem()
            # district the property is located in
            if district == 'dongcheng':
                item['location'] = '东城'
            elif district == 'xicheng':
                item['location'] = '西城'
            elif district == 'haidian':
                item['location'] = '海淀'
            elif district == 'chaoyang':
                item['location'] = '朝阳'
            # property name
            item['name'] = each.xpath("./div[1]/div[@class='title']/a/text()").extract()
            # floor area in square metres
            item['area'] = each.xpath("./div[1]/div[@class='address']/div/text()").extract()[0].split('|')[1]
            # total price, suffixed with 万 (ten thousand yuan)
            item['TTprice'] = str(each.xpath("./div[1]/div[@class='priceInfo']/div[1]/span/text()").extract()[0]) + '万'
            # unit price per square metre
            item['UnitPrice'] = each.xpath("./div[1]/div[@class='priceInfo']/div[2]/span/text()").extract()
            # skip listings where any field failed to extract
            if item['name'] and item['area'] and item['TTprice'] and item['UnitPrice']:
                yield item
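After the crawl finishes (scrapy crawl homeLink), the output can be sanity-checked; this snippet assumes the pipeline sketch above, which wrote HomeLinkData.json:

# quick check of the scraped output (file name follows the pipeline sketch)
import json

with open('HomeLinkData.json', encoding='utf-8') as f:
    records = json.load(f)
print(len(records))  # at most 4 districts x 5 pages of listings
print(records[0])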