123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100 |
- # -*- coding: utf-8 -*-
- import scrapy
- import MySQLdb
- import time
- import re
class MeunspiderSpider(scrapy.Spider):
    """Crawl xiachufang.com "explore" listings and store recipes in MySQL.

    ``parse`` walks a listing page and schedules one request per recipe
    link.  ``doparse`` scrapes the recipe page and writes it to the ``menu``
    and ``menu_step`` tables, skipping titles that are already stored.
    """

    name = 'meunSpiderEveryday'
    allowed_domains = ['www.xiachufang.com']
    start_urls = [
        'http://www.xiachufang.com/explore/',
        'http://www.xiachufang.com/explore/rising/',
        'http://www.xiachufang.com/explore/head/',
        'http://www.xiachufang.com/explore/honor/',
        'http://www.xiachufang.com/explore/monthhonor/',
        'http://www.xiachufang.com/explore/created/',
        'http://www.xiachufang.com/explore/menu/pop/',
    ]

    # Removes markup from the ingredient summary scraped on listing pages.
    _TAG_RE = re.compile(r'<[^>]+>', re.S)
    # Layout of the "datePublished" text on a recipe page.
    _DATE_FORMAT = "%Y-%m-%d %H:%M:%S"

    _SQL_FIND_TITLE = "SELECT 1 FROM menu WHERE title = %s LIMIT 1"
    _SQL_INSERT_MENU = (
        "INSERT INTO menu"
        "(title,tags,intro,ingredients,url,burden,created_at,origin,status) "
        "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    )
    _SQL_INSERT_STEP = (
        "INSERT INTO menu_step(url,menu_id,step) VALUES (%s,%s,%s)"
    )

    @staticmethod
    def _clean(text):
        """Return ``text`` stripped and without newlines; '' when missing."""
        if text is None:
            return ''
        return text.strip().replace('\n', '')

    def parse(self, response):
        """Yield one recipe-page request per linked entry of a listing page.

        The title, the tag-free ingredient summary and the listing name are
        forwarded to ``doparse`` through ``Request.meta``.  Pagination links
        are not followed.
        """
        # Second-to-last "/"-separated piece of the URL, e.g. "rising".
        filename = response.url.split("/")[-2]
        for entry in response.xpath("//ul[@class='list']/li"):
            linkurl = entry.xpath(".//a/@href").extract_first()
            if linkurl is None:
                continue

            title = entry.xpath(
                ".//div[@class='info pure-u']/p[@class='name']/a/text()"
            ).extract_first()
            if title is not None:
                title = self._clean(title)

            # An entry without a summary paragraph must not abort the page.
            summary = entry.xpath(".//p[@class='ing ellipsis']").extract_first()
            if summary is None:
                outerdescrib = ''
            else:
                outerdescrib = self._TAG_RE.sub('', summary)

            yield scrapy.Request(
                response.urljoin(linkurl),
                meta={
                    'filename': filename,
                    'linkurl': linkurl,
                    'title': title,
                    'outerdescrib': outerdescrib,
                },
                callback=self.doparse,
            )

    def doparse(self, response):
        """Scrape one recipe page and persist it unless the title exists.

        All page parsing happens before the database connection is opened,
        so a malformed page never leaves a connection behind.  Returns
        ``None``; problems are reported through the spider logger.
        """
        title = response.meta['title']
        if not title:
            self.logger.warning(
                "Skipping %s: listing entry had no title", response.url)
            return

        menu_row = self._extract_menu_row(response, title)
        if menu_row is None:
            return

        # NOTE(review): connection settings are hard-coded; consider moving
        # them to the project settings.
        db = MySQLdb.connect(
            "127.0.0.1", "root", "Coupon@123", "caipu",
            use_unicode=True, charset="utf8")
        try:
            self._store_recipe(db, menu_row, response)
        finally:
            db.close()

    def _extract_menu_row(self, response, title):
        """Build the ``menu`` parameter tuple, or ``None`` if the date is bad."""
        published = response.xpath(
            "//span[@itemprop='datePublished']/text()").extract_first()
        try:
            # Parse the page's date text, then convert it to an epoch
            # timestamp (local time, as ``time.mktime`` defines it).
            parsed = time.strptime(self._clean(published), self._DATE_FORMAT)
            timestamp = time.mktime(parsed)
        except (ValueError, OverflowError):
            self.logger.warning(
                "Skipping %s: unusable publish date %r",
                response.url, published)
            return None

        figure = response.xpath(
            "//div[@class='cover image expandable block-negative-margin']"
            "/img/@src").extract_first()
        describ = self._clean(response.xpath(
            "//div[@itemprop='description']/text()").extract_first())

        return (
            title,
            response.meta['outerdescrib'] or '',
            describ,
            self._extract_ingredients(response),
            figure or '',
            ' ',
            # NOTE(review): sent as text such as '1500000000.0'; confirm the
            # created_at column type before switching to an integer.
            str(timestamp),
            response.meta['filename'],
            1,
        )

    def _extract_ingredients(self, response):
        """Return the ingredients as ``name,amount;name,amount``."""
        amounts = {}
        for row in response.xpath("//tr[@itemprop='recipeIngredient']"):
            label = self._clean(
                row.xpath(".//td[@class='name']/text()").extract_first())
            if not label:
                # Fall back to the link text when the cell's own text is blank.
                label = self._clean(
                    row.xpath("./td[@class='name']/a/text()").extract_first())
            if not label:
                continue
            amounts[label] = self._clean(
                row.xpath(".//td[@class='unit']/text()").extract_first())
        joined = ";".join(
            label + "," + amount for label, amount in amounts.items())
        return joined.rstrip(';')

    def _extract_steps(self, response):
        """Return ``(image_url, text)`` pairs for the cooking steps."""
        steps = []
        for step in response.xpath("//li[@class='container']"):
            image = step.xpath("./img/@src").extract_first()
            text = step.xpath("./p[@class='text']/text()").extract_first()
            steps.append((image or '', text.strip() if text else ''))
        return steps

    def _store_recipe(self, db, menu_row, response):
        """Insert the recipe and its steps unless the title is already stored.

        Values are always passed as query parameters, never interpolated
        into the SQL text, because they come from scraped pages.
        """
        cursor = db.cursor()
        try:
            try:
                cursor.execute(self._SQL_FIND_TITLE, (menu_row[0],))
                already_stored = cursor.fetchone() is not None
            except MySQLdb.Error:
                self.logger.exception(
                    "Title lookup failed for %s", response.url)
                return
            if already_stored:
                return

            try:
                cursor.execute(self._SQL_INSERT_MENU, menu_row)
                db.commit()
                menu_id = int(cursor.lastrowid)
            except MySQLdb.Error:
                db.rollback()
                self.logger.exception(
                    "Recipe insert failed for %s", response.url)
                # Without a real id the steps would attach to another recipe.
                return

            # Steps are committed one by one so a bad row only loses itself.
            for image, text in self._extract_steps(response):
                try:
                    cursor.execute(
                        self._SQL_INSERT_STEP, (image, menu_id, text))
                    db.commit()
                except MySQLdb.Error:
                    db.rollback()
                    self.logger.exception(
                        "Step insert failed for %s", response.url)
        finally:
            cursor.close()
|