12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697 |
# -*- coding: utf-8 -*-
import re
import time

import MySQLdb
import scrapy
class MeunspiderSpider(scrapy.Spider):
    """Crawl xiachufang.com recipe listings and store recipes in MySQL.

    ``parse`` walks the explore listing pages, follows each recipe link
    and the "next" pagination link; ``doparse`` extracts the recipe
    detail page and inserts one row into ``menu`` plus one row per
    cooking step into ``menu_step``.
    """

    name = 'menuSpider'
    allowed_domains = ['www.xiachufang.com']
    start_urls = [
        'http://www.xiachufang.com/explore/',
        'http://www.xiachufang.com/explore/rising/',
        'http://www.xiachufang.com/explore/head/',
        'http://www.xiachufang.com/explore/honor/',
        'http://www.xiachufang.com/explore/monthhonor/',
        'http://www.xiachufang.com/explore/created/',
        'http://www.xiachufang.com/explore/menu/pop/',
    ]

    # Strips HTML tags from an extracted fragment. Compiled once at class
    # level instead of once per listing row.
    _TAG_RE = re.compile(r'<[^>]+>', re.S)

    def parse(self, response):
        """Parse one listing page.

        Yields a ``scrapy.Request`` per recipe (handled by ``doparse``)
        carrying the listing name, title and short description in
        ``meta``, then follows the pagination link, if any.
        """
        # Second-to-last URL path segment names the listing (e.g.
        # 'rising'); stored later as the recipe's origin.
        filename = response.url.split("/")[-2]
        for row in response.xpath("//ul[@class='list']/li"):
            linkurl = row.xpath(".//a/@href").extract_first()
            title = row.xpath(
                ".//div[@class='info pure-u']/p[@class='name']/a/text()"
            ).extract_first()
            if title is not None:
                title = title.strip().replace('\n', '')
            outerdescrib = row.xpath(
                ".//p[@class='ing ellipsis']").extract_first()
            # Guard: extract_first() may return None; the original code
            # passed it straight to re.sub and crashed with TypeError.
            if outerdescrib is not None:
                outerdescrib = self._TAG_RE.sub('', outerdescrib)
            if linkurl is not None:
                yield scrapy.Request(
                    response.urljoin(linkurl),
                    meta={
                        'filename': filename,
                        'linkurl': linkurl,
                        'title': title,
                        'outerdescrib': outerdescrib,
                    },
                    callback=self.doparse,
                )
        next_page_url = response.xpath(
            '//a[@class="next"]/@href').extract_first()
        if next_page_url is not None:
            yield scrapy.Request(
                response.urljoin(next_page_url), callback=self.parse)

    def doparse(self, response):
        """Parse a recipe detail page and persist it to MySQL.

        Inserts the recipe into ``menu`` and each cooking step into
        ``menu_step``; each statement is committed individually and
        rolled back on a database error. The connection is always
        closed, even when an exception escapes.
        """
        title = response.meta['title']
        outerdescrib = response.meta['outerdescrib']
        filename = response.meta['filename']
        figure = response.xpath(
            "//div[@class='cover image expandable block-negative-margin']"
            "/img/@src").extract_first()
        describ = response.xpath(
            "//div[@itemprop='description']/text()").extract_first()
        if describ:
            describ = describ.strip().replace('\n', '')

        # Ingredient name -> amount, taken from the recipe table; the
        # name is either plain text or wrapped in a link.
        materials = {}
        for mater in response.xpath("//tr[@itemprop='recipeIngredient']"):
            name_text = mater.xpath(
                ".//td[@class='name']/text()").extract_first()
            if name_text is None or name_text.strip().replace('\n', '') == '':
                sindex = mater.xpath(
                    ".//td[@class='name']/a/text()").extract_first()
            else:
                sindex = name_text
            unit = mater.xpath("./td[@class='unit']/text()").extract_first()
            if sindex is None or unit is None:
                continue  # skip malformed ingredient rows instead of crashing
            materials[sindex.strip().replace('\n', '')] = (
                unit.strip().replace('\n', ''))

        created_at = response.xpath(
            "//span[@itemprop='datePublished']/text()"
        ).extract_first().strip().replace('\n', '')
        # Convert "YYYY-mm-dd HH:MM:SS" page text to a Unix timestamp.
        timestamp = time.mktime(
            time.strptime(created_at, "%Y-%m-%d %H:%M:%S"))

        # Serialize ingredients as "name,amount;name,amount".
        materials_str = ";".join(
            "%s,%s" % (key, value) for key, value in materials.items())

        # NOTE(review): credentials are hard-coded; move them to spider
        # settings. Connection is opened per detail page, as before.
        db = MySQLdb.connect("127.0.0.1", "root", "Coupon@123", "caipu",
                             use_unicode=True, charset="utf8")
        try:
            cursor = db.cursor()
            # Parameterized query: scraped text must never be interpolated
            # into SQL directly (injection and quoting bugs).
            menu_sql = ("INSERT INTO menu(title,tags,intro,ingredients,url,"
                        "burden,created_at,origin,status) "
                        "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)")
            try:
                cursor.execute(menu_sql, (title, outerdescrib, describ,
                                          materials_str, figure, ' ',
                                          timestamp, filename, 1))
                db.commit()
                tag_id = int(cursor.lastrowid)
            except MySQLdb.Error:
                db.rollback()
                tag_id = 1  # fall back to a dummy menu id, as before

            step_sql = ("INSERT INTO menu_step(url,menu_id,step) "
                        "VALUES (%s,%s,%s)")
            for step in response.xpath("//li[@class='container']"):
                step_text = step.xpath(
                    "./p[@class='text']/text()").extract_first()
                try:
                    cursor.execute(step_sql, (
                        step.xpath("./img/@src").extract_first(),
                        tag_id,
                        step_text.strip() if step_text is not None else None,
                    ))
                    db.commit()
                except MySQLdb.Error:
                    db.rollback()
        finally:
            # Always release the connection, even on unexpected errors.
            db.close()
|