No description

meunSpider.py 4.6KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697
  1. # -*- coding: utf-8 -*-
  2. import scrapy
  3. import MySQLdb
  4. import time
  5. import re
  6. class MeunspiderSpider(scrapy.Spider):
  7. name = 'menuSpider'
  8. allowed_domains = ['www.xiachufang.com']
  9. start_urls = [
  10. 'http://www.xiachufang.com/explore/',
  11. 'http://www.xiachufang.com/explore/rising/',
  12. 'http://www.xiachufang.com/explore/head/',
  13. 'http://www.xiachufang.com/explore/honor/',
  14. 'http://www.xiachufang.com/explore/monthhonor/',
  15. 'http://www.xiachufang.com/explore/created/',
  16. 'http://www.xiachufang.com/explore/menu/pop/',
  17. ]
  18. def parse(self, response):
  19. filename = response.url.split("/")[-2]
  20. for quote in response.xpath("//ul[@class='list']/li"):
  21. linkurl=(quote.xpath(".//a/@href").extract_first())
  22. title=(quote.xpath(".//div[@class='info pure-u']/p[@class='name']/a/text()").extract_first())
  23. if(title !=None):
  24. title=(title.strip().replace('\n',''))
  25. outerdescrib=(quote.xpath(".//p[@class='ing ellipsis']").extract_first())
  26. dr = re.compile(r'<[^>]+>',re.S)
  27. outerdescrib = dr.sub('',outerdescrib)
  28. if linkurl is not None:
  29. yield scrapy.Request(response.urljoin(linkurl),meta={'filename':filename,'linkurl': linkurl,'title':title,'outerdescrib':outerdescrib},callback=self.doparse)
  30. # yield {
  31. # 'linkurl':linkurl,
  32. # 'title':title,
  33. # 'outerdescrib':outerdescrib
  34. # }
  35. next_page_url = response.xpath('//a[@class="next"]/@href').extract_first()
  36. if next_page_url is not None:
  37. yield scrapy.Request(response.urljoin(next_page_url),callback=self.parse)
  38. def doparse(self,response):
  39. # db = MySQLdb.connect("127.0.0.1","root","123456","menu",use_unicode=True,charset="utf8")
  40. db = MySQLdb.connect("127.0.0.1","root","Coupon@123","caipu",use_unicode=True,charset="utf8")
  41. cursor = db.cursor()
  42. title = response.meta['title']
  43. outerdescrib = response.meta['outerdescrib']
  44. filename = response.meta['filename']
  45. figure=response.xpath("//div[@class='cover image expandable block-negative-margin']/img/@src").extract_first()
  46. describ=response.xpath("//div[@itemprop='description']/text()").extract_first()
  47. if(describ!='None' and describ != None and describ!=''):
  48. describ=describ.strip().replace('\n','')
  49. materials={}
  50. a={}
  51. for mater in response.xpath("//tr[@itemprop='recipeIngredient']"):
  52. sindex=''
  53. if(mater.xpath(".//td[@class='name']/text()").extract_first().strip().replace('\n','')==''):
  54. sindex=mater.xpath(".//td[@class='name']/a/text()").extract_first()
  55. else:
  56. sindex=mater.xpath(".//td[@class='name']/text()").extract_first()
  57. sindex=sindex.strip().replace('\n','')
  58. materials[sindex]=mater.xpath("./td[@class='unit']/text()").extract_first().strip().replace('\n','')
  59. created_at=response.xpath("//span[@itemprop='datePublished']/text()").extract_first().strip().replace('\n','')
  60. #转换成时间数组
  61. timeArray = time.strptime(created_at, "%Y-%m-%d %H:%M:%S")
  62. #转换成时间戳
  63. timestamp = time.mktime(timeArray)
  64. materialsStr=''
  65. for key in materials:
  66. materialsStr=materialsStr+key+","+materials[key]+";"
  67. materials=materialsStr.rstrip(';')
  68. sql = "INSERT INTO menu(title,tags,intro,ingredients,url,burden,created_at,origin,status) VALUES ('%s','%s','%s','%s','%s' ,'%s' ,'%s' ,'%s' ,'%s' )"%(title,outerdescrib,describ,materials,figure,' ',timestamp,filename,1)
  69. try:
  70. # # 执行sql语句
  71. cursor.execute(sql)
  72. # 提交到数据库执行
  73. db.commit()
  74. tag_id = int(cursor.lastrowid)
  75. except:
  76. # # Rollback in case there is any error
  77. tag_id = 1
  78. db.rollback()
  79. for step in response.xpath("//li[@class='container']"):
  80. stepSql="INSERT INTO menu_step(url,menu_id,step) VALUES ( '%s','%s','%s' )"%(step.xpath("./img/@src").extract_first(),tag_id,step.xpath("./p[@class='text']/text()").extract_first().strip())
  81. # cursor.execute(stepSql)
  82. # # 提交到数据库执行
  83. # db.commit()
  84. try:
  85. # # 执行sql语句
  86. cursor.execute(stepSql)
  87. # 提交到数据库执行
  88. db.commit()
  89. except:
  90. # # Rollback in case there is any error
  91. db.rollback()
  92. # 关闭数据库连接
  93. db.close()