No description provided.

foodEveryday.py (4.4 KB)
  1. # encoding=utf8
  2. from scrapy.selector import Selector
  3. import scrapy
  4. import MySQLdb
  5. import time
  6. import random
  7. from pyquery import PyQuery as pq
  8. class quyaqu(scrapy.Spider):
  9. name="foodEveryday"
  10. start_urls=[
  11. 'http://m.39.net/food/nutrition/',
  12. 'http://m.39.net/food/pr/',
  13. 'http://m.39.net/food/sjys/',
  14. 'http://m.39.net/food/ms/',
  15. 'http://m.39.net/food/ylj/',
  16. 'http://m.39.net/food/ttys/',
  17. 'http://m.39.net/food/yl/',
  18. 'http://m.39.net/food/slys/',
  19. 'http://m.39.net/food/dzys/',
  20. 'http://m.39.net/food/zt/',
  21. 'http://m.39.net/food/xmt/'
  22. ]
  23. def parse(self, response):
  24. filename = response.url.split("/")[-2]
  25. # fhtml=open(filename, 'a')
  26. for quote in response.xpath("//li/a"):
  27. linkurl=(quote.xpath("./@href").extract_first())
  28. imgsrc=(quote.xpath("./img/@src").extract_first())
  29. if imgsrc is not None:
  30. spantitle=(quote.xpath("./span/text()").extract_first())
  31. if spantitle is None:
  32. spantitle=(quote.xpath("./text()").extract_first())
  33. if linkurl is not None:
  34. yield scrapy.Request(response.urljoin(linkurl),meta={'linkurl': linkurl,'imgsrc':imgsrc,'filename':filename,'spantitle':spantitle},callback=self.doparse)
  35. # fhtml.write(linkurl+ "\n")
  36. # fhtml.close()
  37. # next_page_url = response.xpath('//span[@class="nextPage"]/a/@href').extract_first()
  38. # if next_page_url is not None:
  39. # yield scrapy.Request(response.urljoin(next_page_url),callback=self.parse)
  40. def doparse(self,response):
  41. # filename = response.url.split("/")[-2]+str(self.index)
  42. # self.index=int(self.index)+1
  43. # with open(filename, 'ab') as f:
  44. # f.write(response.body)
  45. # f.close()
  46. filename = response.meta['filename']
  47. spantitle = response.meta['spantitle']
  48. linkurl = response.meta['linkurl']
  49. content=response.xpath('//div[@id="mArt_ps"]').extract_first()
  50. releaseTime=response.xpath('//div[@class="info w100 ov color3"]/span[1]/text()').extract_first()
  51. imgsrc=response.xpath('//div[@id="mArt_ps"]/p//img[1]/@src').extract_first()
  52. # print(releaseTime)
  53. # content=content.replace(u'\xa0', u' ')
  54. db = MySQLdb.connect("127.0.0.1","root","Coupon@123","caipu",use_unicode=True,charset="utf8")
  55. # db = MySQLdb.connect("127.0.0.1","root","123456","menu",use_unicode=True,charset="utf8")
  56. cursor = db.cursor()
  57. # sql = "INSERT INTO news(title,imgUrl,linkUrl,content,createTime,category,readNum) VALUES ('%s', '%s','%s','%s','%s','%s','%s')"%(spantitle,imgsrc,linkurl,content,releaseTime,filename)
  58. # try:
  59. # # # 执行sql语句
  60. # cursor.execute(sql)
  61. # # 提交到数据库执行
  62. # db.commit()
  63. # except:
  64. # # # Rollback in case there is any error
  65. # db.rollback()
  66. sqlone = "SELECT * FROM news WHERE linkUrl = '%s'" % (linkurl);
  67. try:
  68. # 执行SQL语句
  69. cursor.execute(sqlone)
  70. # 获取所有记录列表
  71. results=cursor.fetchall()
  72. if(len(results)==0):
  73. sql = "INSERT INTO news(title,imgUrl,linkUrl,content,createTime,category,readNum) VALUES ('%s', '%s','%s','%s','%s','%s','%s')"%(spantitle,imgsrc,linkurl,content,releaseTime,filename,int(random.randrange(500))+int(500))
  74. try:
  75. # # 执行sql语句
  76. cursor.execute(sql)
  77. # 提交到数据库执行
  78. db.commit()
  79. except:
  80. # # Rollback in case there is any error
  81. db.rollback()
  82. except:
  83. print(1);
  84. # 关闭数据库连接
  85. db.close()
  86. # for quotein in response.xpath("//li/a"):
  87. # sel = Selector(response)
  88. # sites = sel.xpath('//img')
  89. # for site in sites:
  90. # title = site.xpath('@src').extract()
  91. # print(title)
  92. # sel = Selector(response)
  93. # sites = sel.xpath('//li/a')
  94. # filename = response.url.split("/")[-2]
  95. # fhtml=open(filename, 'a')
  96. # for site in sites:
  97. # title = site.xpath('@src').extract()
  98. # fhtml.write("".join(title)+ "\n")
  99. # fhtml.close()