No Description

meunSpiderEveryday.py 4.8KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100
  1. # -*- coding: utf-8 -*-
  2. import scrapy
  3. import MySQLdb
  4. import time
  5. import re
  6. class MeunspiderSpider(scrapy.Spider):
  7. name = 'meunSpiderEveryday'
  8. allowed_domains = ['www.xiachufang.com']
  9. start_urls = [
  10. 'http://www.xiachufang.com/explore/',
  11. 'http://www.xiachufang.com/explore/rising/',
  12. 'http://www.xiachufang.com/explore/head/',
  13. 'http://www.xiachufang.com/explore/honor/',
  14. 'http://www.xiachufang.com/explore/monthhonor/',
  15. 'http://www.xiachufang.com/explore/created/',
  16. 'http://www.xiachufang.com/explore/menu/pop/',
  17. ]
  18. def parse(self, response):
  19. # print(response.body)
  20. filename = response.url.split("/")[-2]
  21. for quote in response.xpath("//ul[@class='list']/li"):
  22. linkurl=(quote.xpath(".//a/@href").extract_first())
  23. title=(quote.xpath(".//div[@class='info pure-u']/p[@class='name']/a/text()").extract_first())
  24. if(title !=None):
  25. title=(title.strip().replace('\n',''))
  26. outerdescrib=(quote.xpath(".//p[@class='ing ellipsis']").extract_first())
  27. dr = re.compile(r'<[^>]+>',re.S)
  28. outerdescrib = dr.sub('',outerdescrib)
  29. # if(outerdescrib !=None):
  30. # outerdescrib=(outerdescrib.strip().replace('\n',''))
  31. if linkurl is not None:
  32. yield scrapy.Request(response.urljoin(linkurl),meta={'filename':filename,'linkurl': linkurl,'title':title,'outerdescrib':outerdescrib},callback=self.doparse)
  33. # yield {
  34. # 'linkurl':linkurl,
  35. # 'title':title,
  36. # 'outerdescrib':outerdescrib
  37. # }
  38. #next_page_url = response.xpath('//a[@class="next"]/@href').extract_first()
  39. #if next_page_url is not None:
  40. # yield scrapy.Request(response.urljoin(next_page_url),callback=self.parse)
  41. def doparse(self,response):
  42. db = MySQLdb.connect("127.0.0.1","root","Coupon@123","caipu",use_unicode=True,charset="utf8")
  43. # db = MySQLdb.connect("127.0.0.1","root","123456","menu",use_unicode=True,charset="utf8")
  44. cursor = db.cursor()
  45. title = response.meta['title']
  46. outerdescrib = response.meta['outerdescrib']
  47. filename = response.meta['filename']
  48. figure=response.xpath("//div[@class='cover image expandable block-negative-margin']/img/@src").extract_first()
  49. describ=response.xpath("//div[@itemprop='description']/text()").extract_first()
  50. if(describ!='None' and describ != None and describ!=''):
  51. describ=describ.strip().replace('\n','')
  52. materials={}
  53. a={}
  54. for mater in response.xpath("//tr[@itemprop='recipeIngredient']"):
  55. sindex=''
  56. if(mater.xpath(".//td[@class='name']/text()").extract_first().strip().replace('\n','')==''):
  57. sindex=mater.xpath("./td[@class='name']/a/text()").extract_first()
  58. else:
  59. sindex=mater.xpath(".//td[@class='name']/text()").extract_first()
  60. sindex=sindex.strip().replace('\n','')
  61. materials[sindex]=mater.xpath(".//td[@class='unit']/text()").extract_first().strip().replace('\n','')
  62. created_at=response.xpath("//span[@itemprop='datePublished']/text()").extract_first().strip().replace('\n','')
  63. #转换成时间数组
  64. timeArray = time.strptime(created_at, "%Y-%m-%d %H:%M:%S")
  65. #转换成时间戳
  66. timestamp = time.mktime(timeArray)
  67. materialsStr=''
  68. for key in materials:
  69. materialsStr=materialsStr+key+","+materials[key]+";"
  70. materials=materialsStr.rstrip(';')
  71. sqlone = "SELECT * FROM menu WHERE title = '%s'" % (title)
  72. try:
  73. # 执行SQL语句
  74. cursor.execute(sqlone)
  75. # 获取所有记录列表
  76. results=cursor.fetchall()
  77. if(len(results)==0):
  78. sql = "INSERT INTO menu(title,tags,intro,ingredients,url,burden,created_at,origin,status) VALUES ('%s','%s','%s','%s','%s' ,'%s' ,'%s' ,'%s' ,'%s' )"%(title,outerdescrib,describ,materials,figure,' ',timestamp,filename,1)
  79. try:
  80. cursor.execute(sql)
  81. db.commit()
  82. tag_id = int(cursor.lastrowid)
  83. except:
  84. tag_id = 1
  85. db.rollback()
  86. for step in response.xpath("//li[@class='container']"):
  87. stepSql="INSERT INTO menu_step(url,menu_id,step) VALUES ( '%s','%s','%s' )"%(step.xpath("./img/@src").extract_first(),tag_id,step.xpath("./p[@class='text']/text()").extract_first().strip())
  88. try:
  89. cursor.execute(stepSql)
  90. db.commit()
  91. except:
  92. db.rollback()
  93. except:
  94. print(1)
  95. # 关闭数据库连接
  96. db.close()