12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697 |
# -*- coding: utf-8 -*-
import re
import time

import MySQLdb
import scrapy
class MeunspiderSpider(scrapy.Spider):
    """Crawl xiachufang.com recipe listings and store recipes in MySQL.

    ``parse`` walks the explore listing pages, follows each recipe link
    and the "next" pagination link; ``doparse`` extracts the recipe
    detail page and inserts one row into ``menu`` plus one row per
    cooking step into ``menu_step``.
    """

    name = 'menuSpider'
    allowed_domains = ['www.xiachufang.com']
    start_urls = [
        'http://www.xiachufang.com/explore/',
        'http://www.xiachufang.com/explore/rising/',
        'http://www.xiachufang.com/explore/head/',
        'http://www.xiachufang.com/explore/honor/',
        'http://www.xiachufang.com/explore/monthhonor/',
        'http://www.xiachufang.com/explore/created/',
        'http://www.xiachufang.com/explore/menu/pop/',
    ]

    # Strips HTML tags from an extracted fragment. Compiled once at class
    # level instead of once per listing row.
    _TAG_RE = re.compile(r'<[^>]+>', re.S)

    def parse(self, response):
        """Parse one listing page.

        Yields a ``scrapy.Request`` per recipe (handled by ``doparse``)
        carrying the listing name, title and short description in
        ``meta``, then follows the pagination link, if any.
        """
        # Second-to-last URL path segment names the listing (e.g.
        # 'rising'); stored later as the recipe's origin.
        filename = response.url.split("/")[-2]
        for row in response.xpath("//ul[@class='list']/li"):
            linkurl = row.xpath(".//a/@href").extract_first()
            title = row.xpath(
                ".//div[@class='info pure-u']/p[@class='name']/a/text()"
            ).extract_first()
            if title is not None:
                title = title.strip().replace('\n', '')
            outerdescrib = row.xpath(
                ".//p[@class='ing ellipsis']").extract_first()
            # Guard: extract_first() may return None; the original code
            # passed it straight to re.sub and crashed with TypeError.
            if outerdescrib is not None:
                outerdescrib = self._TAG_RE.sub('', outerdescrib)
            if linkurl is not None:
                yield scrapy.Request(
                    response.urljoin(linkurl),
                    meta={
                        'filename': filename,
                        'linkurl': linkurl,
                        'title': title,
                        'outerdescrib': outerdescrib,
                    },
                    callback=self.doparse,
                )
        next_page_url = response.xpath(
            '//a[@class="next"]/@href').extract_first()
        if next_page_url is not None:
            yield scrapy.Request(
                response.urljoin(next_page_url), callback=self.parse)

    def doparse(self, response):
        """Parse a recipe detail page and persist it to MySQL.

        Inserts the recipe into ``menu`` and each cooking step into
        ``menu_step``; each statement is committed individually and
        rolled back on a database error. The connection is always
        closed, even when an exception escapes.
        """
        title = response.meta['title']
        outerdescrib = response.meta['outerdescrib']
        filename = response.meta['filename']
        figure = response.xpath(
            "//div[@class='cover image expandable block-negative-margin']"
            "/img/@src").extract_first()
        describ = response.xpath(
            "//div[@itemprop='description']/text()").extract_first()
        if describ:
            describ = describ.strip().replace('\n', '')

        # Ingredient name -> amount, taken from the recipe table; the
        # name is either plain text or wrapped in a link.
        materials = {}
        for mater in response.xpath("//tr[@itemprop='recipeIngredient']"):
            name_text = mater.xpath(
                ".//td[@class='name']/text()").extract_first()
            if name_text is None or name_text.strip().replace('\n', '') == '':
                sindex = mater.xpath(
                    ".//td[@class='name']/a/text()").extract_first()
            else:
                sindex = name_text
            unit = mater.xpath("./td[@class='unit']/text()").extract_first()
            if sindex is None or unit is None:
                continue  # skip malformed ingredient rows instead of crashing
            materials[sindex.strip().replace('\n', '')] = (
                unit.strip().replace('\n', ''))

        created_at = response.xpath(
            "//span[@itemprop='datePublished']/text()"
        ).extract_first().strip().replace('\n', '')
        # Convert "YYYY-mm-dd HH:MM:SS" page text to a Unix timestamp.
        timestamp = time.mktime(
            time.strptime(created_at, "%Y-%m-%d %H:%M:%S"))

        # Serialize ingredients as "name,amount;name,amount".
        materials_str = ";".join(
            "%s,%s" % (key, value) for key, value in materials.items())

        # NOTE(review): credentials are hard-coded; move them to spider
        # settings. Connection is opened per detail page, as before.
        db = MySQLdb.connect("127.0.0.1", "root", "Coupon@123", "caipu",
                             use_unicode=True, charset="utf8")
        try:
            cursor = db.cursor()
            # Parameterized query: scraped text must never be interpolated
            # into SQL directly (injection and quoting bugs).
            menu_sql = ("INSERT INTO menu(title,tags,intro,ingredients,url,"
                        "burden,created_at,origin,status) "
                        "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)")
            try:
                cursor.execute(menu_sql, (title, outerdescrib, describ,
                                          materials_str, figure, ' ',
                                          timestamp, filename, 1))
                db.commit()
                tag_id = int(cursor.lastrowid)
            except MySQLdb.Error:
                db.rollback()
                tag_id = 1  # fall back to a dummy menu id, as before

            step_sql = ("INSERT INTO menu_step(url,menu_id,step) "
                        "VALUES (%s,%s,%s)")
            for step in response.xpath("//li[@class='container']"):
                step_text = step.xpath(
                    "./p[@class='text']/text()").extract_first()
                try:
                    cursor.execute(step_sql, (
                        step.xpath("./img/@src").extract_first(),
                        tag_id,
                        step_text.strip() if step_text is not None else None,
                    ))
                    db.commit()
                except MySQLdb.Error:
                    db.rollback()
        finally:
            # Always release the connection, even on unexpected errors.
            db.close()
|