# -*- coding: utf-8 -*-
# 13283339616 / python
import scrapy
import MySQLdb
import time
import re


class MeunspiderSpider(scrapy.Spider):
    """Crawl xiachufang.com "explore" listings and store recipes in MySQL.

    ``parse`` walks each listing page in ``start_urls`` and schedules one
    request per recipe link. ``doparse`` handles the recipe page: it
    extracts the cover image, description, ingredients, publish date and
    steps, then writes one row to the ``menu`` table and one row per step
    to ``menu_step``. A recipe whose title already exists in ``menu`` is
    not inserted again.
    """

    name = 'meunSpiderEveryday'
    allowed_domains = ['www.xiachufang.com']
    start_urls = [
        'http://www.xiachufang.com/explore/',
        'http://www.xiachufang.com/explore/rising/',
        'http://www.xiachufang.com/explore/head/',
        'http://www.xiachufang.com/explore/honor/',
        'http://www.xiachufang.com/explore/monthhonor/',
        'http://www.xiachufang.com/explore/created/',
        'http://www.xiachufang.com/explore/menu/pop/',
    ]

    # Matches any markup tag; compiled once instead of once per list item.
    _TAG_RE = re.compile(r'<[^>]+>', re.S)

    # NOTE(review): credentials are hard-coded in source. They are kept
    # unchanged so the spider keeps working, but should be moved to
    # settings or the environment.
    _DB_ARGS = ("127.0.0.1", "root", "Coupon@123", "caipu")
    _DB_KWARGS = {'use_unicode': True, 'charset': 'utf8'}

    _DATE_FORMAT = "%Y-%m-%d %H:%M:%S"

    _SQL_FIND_MENU = "SELECT 1 FROM menu WHERE title = %s LIMIT 1"
    _SQL_INSERT_MENU = (
        "INSERT INTO menu"
        "(title,tags,intro,ingredients,url,burden,created_at,origin,status) "
        "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    )
    _SQL_INSERT_STEP = (
        "INSERT INTO menu_step(url,menu_id,step) VALUES (%s,%s,%s)"
    )

    def parse(self, response):
        """Yield one recipe-page request per entry of a listing page.

        Each request carries, in ``meta``, the listing name taken from the
        URL (``filename``), the raw link (``linkurl``), the cleaned
        ``title`` (may be ``None``) and the tag-stripped ingredient
        summary (``outerdescrib``). Entries without a link are skipped.
        Only the listing page itself is processed; no next-page link is
        followed.
        """
        # Second-to-last path segment, e.g. ".../explore/rising/" -> "rising".
        filename = response.url.split("/")[-2]
        for quote in response.xpath("//ul[@class='list']/li"):
            linkurl = quote.xpath(".//a/@href").extract_first()
            if linkurl is None:
                continue

            title = quote.xpath(
                ".//div[@class='info pure-u']/p[@class='name']/a/text()"
            ).extract_first()
            if title is not None:
                title = title.strip().replace('\n', '')

            # The whole <p> element is extracted, so strip its markup. A
            # missing element yields '' instead of raising TypeError and
            # aborting the rest of the listing page.
            outerdescrib = quote.xpath(
                ".//p[@class='ing ellipsis']"
            ).extract_first()
            outerdescrib = self._TAG_RE.sub('', outerdescrib or '')

            yield scrapy.Request(
                response.urljoin(linkurl),
                meta={
                    'filename': filename,
                    'linkurl': linkurl,
                    'title': title,
                    'outerdescrib': outerdescrib,
                },
                callback=self.doparse,
            )

    def doparse(self, response):
        """Extract one recipe page and persist it to MySQL.

        Reads ``title``, ``outerdescrib`` and ``filename`` from
        ``response.meta`` (set by ``parse``). The page is fully parsed
        before the database connection is opened, and the connection is
        always closed. A page without a parsable publish date is skipped
        with a warning. Database errors are logged, not raised.
        """
        title = response.meta['title']
        outerdescrib = response.meta['outerdescrib']
        filename = response.meta['filename']

        figure = response.xpath(
            "//div[@class='cover image expandable block-negative-margin']"
            "/img/@src"
        ).extract_first()
        describ = response.xpath(
            "//div[@itemprop='description']/text()"
        ).extract_first()
        if describ is not None:
            describ = describ.strip().replace('\n', '')

        timestamp = self._parse_timestamp(response)
        if timestamp is None:
            self.logger.warning(
                "Skipping %s: publish date missing or malformed",
                response.url,
            )
            return

        materials = self._extract_materials(response)

        # (image url, text) per step; a step without text is stored as ''.
        steps = [
            (
                step.xpath("./img/@src").extract_first(),
                (step.xpath("./p[@class='text']/text()").extract_first()
                 or '').strip(),
            )
            for step in response.xpath("//li[@class='container']")
        ]

        # Values are rendered to text exactly as the previous '%s' string
        # formatting did (so None is still stored as 'None'), keeping the
        # stored data and the title de-duplication unchanged. Only the
        # quoting/escaping is now done by the driver.
        menu_row = tuple(
            self._as_text(value)
            for value in (title, outerdescrib, describ, materials, figure,
                          ' ', timestamp, filename, 1)
        )

        db = MySQLdb.connect(*self._DB_ARGS, **self._DB_KWARGS)
        try:
            self._store_recipe(db, menu_row, steps)
        except MySQLdb.Error:
            self.logger.exception(
                "Database error while storing %s", response.url
            )
        finally:
            # Close the connection on every path.
            db.close()

    @staticmethod
    def _as_text(value):
        """Render *value* the way ``'%s' % value`` does."""
        return '%s' % (value,)

    @staticmethod
    def _clean(text):
        """Return *text* stripped with newlines removed; '' for ``None``."""
        if text is None:
            return ''
        return text.strip().replace('\n', '')

    def _extract_materials(self, response):
        """Return the ingredients as ``"name,unit;name,unit"``."""
        materials = {}
        for row in response.xpath("//tr[@itemprop='recipeIngredient']"):
            ingredient = self._clean(
                row.xpath(".//td[@class='name']/text()").extract_first()
            )
            if ingredient == '':
                # No usable direct text: fall back to the link inside the cell.
                ingredient = self._clean(
                    row.xpath("./td[@class='name']/a/text()").extract_first()
                )
            if ingredient == '':
                # Nothing to store for a row without a name.
                continue
            materials[ingredient] = self._clean(
                row.xpath(".//td[@class='unit']/text()").extract_first()
            )
        return ";".join(
            "%s,%s" % (ingredient, unit)
            for ingredient, unit in materials.items()
        )

    def _parse_timestamp(self, response):
        """Return the publish date as a Unix timestamp, or ``None``."""
        raw = self._clean(
            response.xpath(
                "//span[@itemprop='datePublished']/text()"
            ).extract_first()
        )
        try:
            # Convert to a time struct, then to a timestamp.
            return time.mktime(time.strptime(raw, self._DATE_FORMAT))
        except (ValueError, OverflowError):
            return None

    def _store_recipe(self, db, menu_row, steps):
        """Insert the recipe and its steps unless the title already exists.

        ``menu_row`` holds the nine ``menu`` column values in insert
        order (title first). A failed ``menu`` insert is rolled back and
        re-raised, so no steps are written for it. A failed step insert
        is rolled back and logged, and the remaining steps are still
        attempted.
        """
        cursor = db.cursor()
        cursor.execute(self._SQL_FIND_MENU, (menu_row[0],))
        if cursor.fetchone() is not None:
            return

        try:
            cursor.execute(self._SQL_INSERT_MENU, menu_row)
            db.commit()
        except MySQLdb.Error:
            db.rollback()
            raise
        menu_id = int(cursor.lastrowid)

        for image_url, text in steps:
            step_row = (
                self._as_text(image_url),
                self._as_text(menu_id),
                self._as_text(text),
            )
            try:
                cursor.execute(self._SQL_INSERT_STEP, step_row)
                db.commit()
            except MySQLdb.Error:
                db.rollback()
                self.logger.exception(
                    "Could not store a step for menu id %s", menu_id
                )