# -*- coding: utf-8 -*-
import random

import MySQLdb
import scrapy
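
# This spider crawls the food channels of m.39.net in two stages: parse()
# gathers article links from each channel page, and doparse() extracts the
# article body and writes it to MySQL. Assuming a standard Scrapy project
# layout, it can be run with:
#     scrapy crawl foodEveryday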

class quyaqu(scrapy.Spider):
    name = "foodEveryday"
    start_urls = [
        'http://m.39.net/food/nutrition/',
        'http://m.39.net/food/pr/',
        'http://m.39.net/food/sjys/',
        'http://m.39.net/food/ms/',
        'http://m.39.net/food/ylj/',
        'http://m.39.net/food/ttys/',
        'http://m.39.net/food/yl/',
        'http://m.39.net/food/slys/',
        'http://m.39.net/food/dzys/',
        'http://m.39.net/food/zt/',
        'http://m.39.net/food/xmt/',
    ]
    def parse(self, response):
        # Use the channel segment of the URL (e.g. "nutrition") as the category.
        filename = response.url.split("/")[-2]
        for quote in response.xpath("//li/a"):
            linkurl = quote.xpath("./@href").extract_first()
            imgsrc = quote.xpath("./img/@src").extract_first()
            # Only follow anchors that carry a thumbnail image.
            if imgsrc is not None:
                # Prefer the <span> caption; fall back to the anchor's own text.
                spantitle = quote.xpath("./span/text()").extract_first()
                if spantitle is None:
                    spantitle = quote.xpath("./text()").extract_first()
                if linkurl is not None:
                    yield scrapy.Request(
                        response.urljoin(linkurl),
                        meta={'linkurl': linkurl, 'imgsrc': imgsrc,
                              'filename': filename, 'spantitle': spantitle},
                        callback=self.doparse)
        # Pagination is currently disabled; uncomment to follow the list pages.
        # next_page_url = response.xpath('//span[@class="nextPage"]/a/@href').extract_first()
        # if next_page_url is not None:
        #     yield scrapy.Request(response.urljoin(next_page_url), callback=self.parse)
    def doparse(self, response):
        filename = response.meta['filename']
        spantitle = response.meta['spantitle']
        linkurl = response.meta['linkurl']
        # Article body, publish date, and first inline image.
        content = response.xpath('//div[@id="mArt_ps"]').extract_first()
        releaseTime = response.xpath('//div[@class="info w100 ov color3"]/span[1]/text()').extract_first()
        imgsrc = response.xpath('//div[@id="mArt_ps"]/p//img[1]/@src').extract_first()
        db = MySQLdb.connect("127.0.0.1", "root", "Coupon@123", "caipu",
                             use_unicode=True, charset="utf8")
        cursor = db.cursor()
        try:
            # Skip articles whose link is already stored.
            cursor.execute("SELECT * FROM news WHERE linkUrl = %s", (linkurl,))
            results = cursor.fetchall()
            if len(results) == 0:
                # Parameterized query: the driver handles quoting and escaping.
                sql = ("INSERT INTO news(title,imgUrl,linkUrl,content,"
                       "createTime,category,readNum) "
                       "VALUES (%s, %s, %s, %s, %s, %s, %s)")
                try:
                    # readNum is seeded with a random value in [500, 1000).
                    cursor.execute(sql, (spantitle, imgsrc, linkurl, content,
                                         releaseTime, filename,
                                         random.randrange(500) + 500))
                    db.commit()
                except MySQLdb.Error:
                    # Roll back in case there is any error.
                    db.rollback()
        except MySQLdb.Error as exc:
            print(exc)
        finally:
            # Close the database connection.
            db.close()
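        # Note: this opens a fresh MySQL connection for every article. A more
        # idiomatic Scrapy design would move persistence into an item pipeline
        # that holds one connection for the whole crawl; see
        # https://docs.scrapy.org/en/latest/topics/item-pipeline.html.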
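
# The INSERT above targets a `news` table along the lines of the following
# sketch. The column types are assumptions inferred from the scraped values,
# not a schema taken from this project:
#
#   CREATE TABLE news (
#       id         INT AUTO_INCREMENT PRIMARY KEY,  -- assumed surrogate key
#       title      VARCHAR(255),
#       imgUrl     VARCHAR(512),
#       linkUrl    VARCHAR(512),
#       content    TEXT,
#       createTime VARCHAR(64),
#       category   VARCHAR(64),
#       readNum    INT
#   ) DEFAULT CHARSET=utf8;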