13283339616 6 年之前
父節點
當前提交
d6b56912c4
共有 2 個文件被更改,包括 17 次插入和 15 次刪除
  1. 6 5
      xiaowu/spiders/food.py
  2. 11 10
      xiaowu/spiders/foodEveryday.py

+ 6 - 5
xiaowu/spiders/food.py

30
             linkurl=(quote.xpath("./@href").extract_first())
30
             linkurl=(quote.xpath("./@href").extract_first())
31
             imgsrc=(quote.xpath("./img/@src").extract_first())
31
             imgsrc=(quote.xpath("./img/@src").extract_first())
32
             spantitle=(quote.xpath("./span/text()").extract_first())
32
             spantitle=(quote.xpath("./span/text()").extract_first())
33
-            if spantitle is None:
34
-                spantitle=(quote.xpath("./text()").extract_first())
35
-            if linkurl is not None:
36
-                yield scrapy.Request(response.urljoin(linkurl),meta={'linkurl': linkurl,'imgsrc':imgsrc,'filename':filename,'spantitle':spantitle},callback=self.doparse)
37
-                # fhtml.write(linkurl+ "\n")
33
+            if imgsrc is not None:
34
+                if spantitle is None:
35
+                    spantitle=(quote.xpath("./text()").extract_first())
36
+                if linkurl is not None:
37
+                    yield scrapy.Request(response.urljoin(linkurl),meta={'linkurl': linkurl,'imgsrc':imgsrc,'filename':filename,'spantitle':spantitle},callback=self.doparse)
38
+                    # fhtml.write(linkurl+ "\n")
38
         # fhtml.close()
39
         # fhtml.close()
39
         next_page_url = response.xpath('//span[@class="nextPage"]/a/@href').extract_first()
40
         next_page_url = response.xpath('//span[@class="nextPage"]/a/@href').extract_first()
40
         if next_page_url is not None:
41
         if next_page_url is not None:

+ 11 - 10
xiaowu/spiders/foodEveryday.py

26
         for quote in response.xpath("//li/a"):
26
         for quote in response.xpath("//li/a"):
27
             linkurl=(quote.xpath("./@href").extract_first())
27
             linkurl=(quote.xpath("./@href").extract_first())
28
             imgsrc=(quote.xpath("./img/@src").extract_first())
28
             imgsrc=(quote.xpath("./img/@src").extract_first())
29
-            spantitle=(quote.xpath("./span/text()").extract_first())
30
-            if spantitle is None:
31
-                spantitle=(quote.xpath("./text()").extract_first())
32
-            if linkurl is not None:
33
-                yield scrapy.Request(response.urljoin(linkurl),meta={'linkurl': linkurl,'imgsrc':imgsrc,'filename':filename,'spantitle':spantitle},callback=self.doparse)
34
-                # fhtml.write(linkurl+ "\n")
35
-                # fhtml.close()
36
-                # next_page_url = response.xpath('//span[@class="nextPage"]/a/@href').extract_first()
37
-                # if next_page_url is not None:
38
-                #     yield scrapy.Request(response.urljoin(next_page_url),callback=self.parse)
29
+            if imgsrc is not None:
30
+                spantitle=(quote.xpath("./span/text()").extract_first())
31
+                if spantitle is None:
32
+                    spantitle=(quote.xpath("./text()").extract_first())
33
+                if linkurl is not None:
34
+                    yield scrapy.Request(response.urljoin(linkurl),meta={'linkurl': linkurl,'imgsrc':imgsrc,'filename':filename,'spantitle':spantitle},callback=self.doparse)
35
+                    # fhtml.write(linkurl+ "\n")
36
+                    # fhtml.close()
37
+                    # next_page_url = response.xpath('//span[@class="nextPage"]/a/@href').extract_first()
38
+                    # if next_page_url is not None:
39
+                    #     yield scrapy.Request(response.urljoin(next_page_url),callback=self.parse)
39
     def doparse(self,response):
40
     def doparse(self,response):
40
         # filename = response.url.split("/")[-2]+str(self.index)
41
         # filename = response.url.split("/")[-2]+str(self.index)
41
         # self.index=int(self.index)+1
42
         # self.index=int(self.index)+1