# -*- coding: utf-8 -*-
"""Spider that crawls xiachufang.com 'explore' listing pages and stores each
recipe (plus its cooking steps) into a MySQL database (tables ``menu`` and
``menu_step``)."""
import re
import time

import MySQLdb
import scrapy

# Strips HTML tags from a markup fragment; compiled once at import time
# instead of once per listing item.
_TAG_RE = re.compile(r'<[^>]+>', re.S)


def _clean(text):
    """Return *text* with surrounding whitespace and embedded newlines removed.

    Returns an empty string when *text* is None, so callers never have to
    None-check values coming out of ``extract_first()``.
    """
    return '' if text is None else text.strip().replace('\n', '')


class MeunspiderSpider(scrapy.Spider):
    name = 'meunSpiderEveryday'
    allowed_domains = ['www.xiachufang.com']
    start_urls = [
        'http://www.xiachufang.com/explore/',
        'http://www.xiachufang.com/explore/rising/',
        'http://www.xiachufang.com/explore/head/',
        'http://www.xiachufang.com/explore/honor/',
        'http://www.xiachufang.com/explore/monthhonor/',
        'http://www.xiachufang.com/explore/created/',
        'http://www.xiachufang.com/explore/menu/pop/',
    ]

    def parse(self, response):
        """Parse an explore listing page and schedule one request per recipe.

        The trailing URL path segment (e.g. ``rising``) is carried in meta as
        the recipe's origin/category, along with the list-page title and the
        tag-stripped ingredient summary.
        """
        filename = response.url.split("/")[-2]
        for item in response.xpath("//ul[@class='list']/li"):
            linkurl = item.xpath(".//a/@href").extract_first()
            title = _clean(item.xpath(
                ".//div[@class='info pure-u']/p[@class='name']/a/text()"
            ).extract_first())
            outerdescrib = item.xpath(
                ".//p[@class='ing ellipsis']").extract_first()
            # Guard against a missing node: re.sub(None) raises TypeError.
            outerdescrib = _TAG_RE.sub('', outerdescrib) if outerdescrib else ''
            if linkurl is not None:
                yield scrapy.Request(
                    response.urljoin(linkurl),
                    meta={
                        'filename': filename,
                        'linkurl': linkurl,
                        'title': title,
                        'outerdescrib': outerdescrib,
                    },
                    callback=self.doparse,
                )

    def doparse(self, response):
        """Parse one recipe detail page and persist it.

        Inserts a ``menu`` row (skipped if a row with the same title already
        exists) and one ``menu_step`` row per cooking step.
        """
        title = response.meta['title']
        outerdescrib = response.meta['outerdescrib']
        filename = response.meta['filename']

        figure = response.xpath(
            "//div[@class='cover image expandable block-negative-margin']"
            "/img/@src").extract_first()
        describ = _clean(response.xpath(
            "//div[@itemprop='description']/text()").extract_first())

        # Ingredient table: the name cell holds either plain text or a link.
        materials = {}
        for mater in response.xpath("//tr[@itemprop='recipeIngredient']"):
            ing_name = _clean(mater.xpath(
                ".//td[@class='name']/text()").extract_first())
            if not ing_name:
                ing_name = _clean(mater.xpath(
                    "./td[@class='name']/a/text()").extract_first())
            unit = _clean(mater.xpath(
                ".//td[@class='unit']/text()").extract_first())
            if ing_name:
                materials[ing_name] = unit
        # Serialize as "name,unit;name,unit" — the legacy storage format.
        materials_str = ';'.join(
            '%s,%s' % (k, v) for k, v in materials.items())

        # Publication time -> unix timestamp; 0 when missing or unparseable
        # (previously a missing date crashed into a silent bare except).
        created_at = _clean(response.xpath(
            "//span[@itemprop='datePublished']/text()").extract_first())
        try:
            timestamp = time.mktime(
                time.strptime(created_at, "%Y-%m-%d %H:%M:%S"))
        except ValueError:
            timestamp = 0

        # NOTE(review): credentials are hard-coded in source — consider moving
        # them into Scrapy settings.
        db = MySQLdb.connect("127.0.0.1", "root", "Coupon@123", "caipu",
                             use_unicode=True, charset="utf8")
        try:
            cursor = db.cursor()
            # Parameterized queries throughout: recipe titles routinely
            # contain quotes, and the previous %-interpolation was an SQL
            # injection vector.
            cursor.execute("SELECT * FROM menu WHERE title = %s", (title,))
            if cursor.fetchall():
                return  # already stored; nothing to do
            try:
                cursor.execute(
                    "INSERT INTO menu(title,tags,intro,ingredients,url,"
                    "burden,created_at,origin,status) "
                    "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                    (title, outerdescrib, describ, materials_str, figure,
                     ' ', timestamp, filename, 1))
                db.commit()
                tag_id = int(cursor.lastrowid)
            except MySQLdb.MySQLError:
                db.rollback()
                self.logger.exception("menu insert failed: %s", title)
                # Previously fell through with tag_id = 1, attaching the
                # steps to an unrelated menu row — bail out instead.
                return
            for step in response.xpath("//li[@class='container']"):
                img = step.xpath("./img/@src").extract_first()
                text = _clean(step.xpath(
                    "./p[@class='text']/text()").extract_first())
                try:
                    cursor.execute(
                        "INSERT INTO menu_step(url,menu_id,step) "
                        "VALUES (%s,%s,%s)",
                        (img, tag_id, text))
                    db.commit()
                except MySQLdb.MySQLError:
                    db.rollback()
                    self.logger.exception("step insert failed: %s", title)
        finally:
            # Always release the connection, even when parsing/storage fails.
            db.close()