# encoding=utf8
"""Scrapy spider for the m.39.net "woman" channel.

Crawls the category listing pages in ``start_urls``, follows every
article link found in ``//li/a``, follows "next page" pagination, and
inserts each article (title, lead image, URL, HTML content, release
time, category slug) into the MySQL table ``news``.
"""
from scrapy.selector import Selector
import scrapy
import MySQLdb
import time
from pyquery import PyQuery as pq


class quyaqu(scrapy.Spider):
    """Spider that scrapes articles from m.39.net and stores them in MySQL."""

    name = "test"
    start_urls = [
        'http://m.39.net/woman/nxbj/',
        'http://m.39.net/woman/nxqg/',
        'http://m.39.net/woman/nxyy/',
        'http://m.39.net/woman/nxht/',
        'http://m.39.net/woman/nxaq/',
        'http://m.39.net/woman/mrj/',
        'http://m.39.net/woman/hwxz/',
        'http://m.39.net/woman/jdsh/',
        'http://m.39.net/woman/nxzx/',
        'http://m.39.net/woman/nxsh/',
        'http://m.39.net/woman/sms/',
        'http://m.39.net/woman/xmt/',
        'http://m.39.net/woman/zt/',
        'http://m.39.net/woman/baike/nxby/',
    ]
    # Running counter (kept for compatibility; not used by the current code path).
    index = 1

    def parse(self, response):
        """Parse a category listing page.

        Yields one request per article link (carrying listing metadata in
        ``meta``) and one request for the next listing page, if any.
        """
        # Category slug, e.g. "nxbj" from http://m.39.net/woman/nxbj/
        filename = response.url.split("/")[-2]
        for item in response.xpath("//li/a"):
            linkurl = item.xpath("./@href").extract_first()
            imgsrc = item.xpath("./img/@src").extract_first()
            spantitle = item.xpath("./span/text()").extract_first()
            if linkurl is not None:
                yield scrapy.Request(
                    response.urljoin(linkurl),
                    meta={
                        'linkurl': linkurl,
                        'imgsrc': imgsrc,
                        'filename': filename,
                        'spantitle': spantitle,
                    },
                    callback=self.doparse,
                )
        # Follow pagination ("next page" anchor), if present.
        next_page_url = response.xpath(
            '//span[@class="nextPage"]/a/@href').extract_first()
        if next_page_url is not None:
            yield scrapy.Request(response.urljoin(next_page_url),
                                 callback=self.parse)

    def doparse(self, response):
        """Parse one article page and insert it into the MySQL ``news`` table.

        Reads the listing metadata from ``response.meta`` and extracts the
        article body, release time and first image from the page itself.
        """
        filename = response.meta['filename']
        spantitle = response.meta['spantitle']
        linkurl = response.meta['linkurl']
        content = response.xpath('//div[@id="mArt_ps"]').extract_first()
        releaseTime = response.xpath(
            '//div[@class="info w100 ov color3"]/span[1]/text()').extract_first()
        imgsrc = response.xpath(
            '//div[@id="mArt_ps"]/p//img[1]/@src').extract_first()

        db = MySQLdb.connect("127.0.0.1", "root", "123456", "ymgj",
                             use_unicode=True, charset="utf8")
        try:
            cursor = db.cursor()
            # Parameterized query: the previous %-formatted SQL broke on any
            # quote character in the scraped text and was an SQL-injection
            # vector; the driver now escapes every value itself.
            sql = ("INSERT INTO news(title,imgUrl,linkUrl,content,createTime,category) "
                   "VALUES (%s, %s, %s, %s, %s, %s)")
            try:
                # Execute the SQL statement.
                cursor.execute(sql, (spantitle, imgsrc, linkurl, content,
                                     releaseTime, filename))
                # Commit the transaction.
                db.commit()
            except MySQLdb.Error:
                # Roll back on any database error, but log it instead of
                # silently swallowing it as the old bare `except:` did.
                self.logger.exception("failed to insert article %s", linkurl)
                db.rollback()
        finally:
            # Always close the connection, even if extraction/insert raised.
            db.close()