# encoding=utf8
import random

import MySQLdb
import scrapy


class quyaqu(scrapy.Spider):
    name = "foodEveryday"
    start_urls = [
        'http://m.39.net/food/nutrition/',
        'http://m.39.net/food/pr/',
        'http://m.39.net/food/sjys/',
        'http://m.39.net/food/ms/',
        'http://m.39.net/food/ylj/',
        'http://m.39.net/food/ttys/',
        'http://m.39.net/food/yl/',
        'http://m.39.net/food/slys/',
        'http://m.39.net/food/dzys/',
        'http://m.39.net/food/zt/',
        'http://m.39.net/food/xmt/',
    ]

    def parse(self, response):
        # The second-to-last URL segment (e.g. "nutrition") doubles as the category.
        filename = response.url.split("/")[-2]
        for quote in response.xpath("//li/a"):
            linkurl = quote.xpath("./@href").extract_first()
            imgsrc = quote.xpath("./img/@src").extract_first()
            # Only follow list entries that carry a thumbnail image.
            if imgsrc is not None:
                spantitle = quote.xpath("./span/text()").extract_first()
                if spantitle is None:
                    # Fall back to the anchor's own text when there is no <span> title.
                    spantitle = quote.xpath("./text()").extract_first()
                if linkurl is not None:
                    yield scrapy.Request(
                        response.urljoin(linkurl),
                        meta={
                            'linkurl': linkurl,
                            'imgsrc': imgsrc,
                            'filename': filename,
                            'spantitle': spantitle,
                        },
                        callback=self.doparse,
                    )
        # Pagination is currently disabled; uncomment to follow "next page" links.
        # next_page_url = response.xpath('//span[@class="nextPage"]/a/@href').extract_first()
        # if next_page_url is not None:
        #     yield scrapy.Request(response.urljoin(next_page_url), callback=self.parse)

    def doparse(self, response):
        filename = response.meta['filename']
        spantitle = response.meta['spantitle']
        linkurl = response.meta['linkurl']
        content = response.xpath('//div[@id="mArt_ps"]').extract_first()
        releaseTime = response.xpath(
            '//div[@class="info w100 ov color3"]/span[1]/text()').extract_first()
        imgsrc = response.xpath('//div[@id="mArt_ps"]/p//img[1]/@src').extract_first()

        # Note: a Scrapy item pipeline is the idiomatic home for persistence;
        # this spider opens a fresh connection per article instead.
        db = MySQLdb.connect("127.0.0.1", "root", "Coupon@123", "caipu",
                             use_unicode=True, charset="utf8")
        cursor = db.cursor()
        try:
            # Deduplicate on linkUrl: only insert articles we have not seen yet.
            # Parameterized queries keep quotes in the scraped HTML from
            # breaking the SQL statement.
            cursor.execute("SELECT * FROM news WHERE linkUrl = %s", (linkurl,))
            # Fetch all matching rows.
            results = cursor.fetchall()
            if len(results) == 0:
                sql = ("INSERT INTO news "
                       "(title, imgUrl, linkUrl, content, createTime, category, readNum) "
                       "VALUES (%s, %s, %s, %s, %s, %s, %s)")
                try:
                    # Execute the insert; readNum is seeded with a random
                    # value in [500, 1000).
                    cursor.execute(sql, (spantitle, imgsrc, linkurl, content,
                                         releaseTime, filename,
                                         random.randrange(500, 1000)))
                    # Commit the insert to the database.
                    db.commit()
                except MySQLdb.Error:
                    # Roll back in case of any error.
                    db.rollback()
        except MySQLdb.Error as e:
            print(e)
        finally:
            # Close the database connection.
            db.close()
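
# Usage note (a minimal sketch, not part of the original spider): the code
# above assumes a MySQL database named `caipu` containing a `news` table.
# The schema is not included in this file; the DDL below is an assumption
# inferred from the INSERT statement's column list, with guessed types and
# sizes.
#
#   CREATE TABLE IF NOT EXISTS news (
#       id         INT AUTO_INCREMENT PRIMARY KEY,
#       title      VARCHAR(255),
#       imgUrl     VARCHAR(512),
#       linkUrl    VARCHAR(512),
#       content    MEDIUMTEXT,
#       createTime VARCHAR(64),
#       category   VARCHAR(64),
#       readNum    INT
#   ) DEFAULT CHARSET=utf8;
#
# With the table in place, run the spider from the Scrapy project root:
#
#   scrapy crawl foodEveryday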