Preface:
This post shows how to store downloaded data in a MySQL database. The data source is the course list on imooc.com (Mooc).
Implementation:
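One prerequisite: the pipeline code below imports MySQLdb, so a driver providing that module must be installed first (on Python 3 this is the mysqlclient package, e.g. pip install mysqlclient; on legacy Python 2 it was MySQL-python).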
- items.py

from scrapy import Item, Field

class MoocspiderItem(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    Url = Field()
    Title = Field()
    Image_Url = Field()
    Student = Field()
    Introduction = Field()
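A quick aside: a populated Item behaves like a dict, which is how the pipelines below read fields by key. A minimal sketch (the field value here is made up):

from MoocSpider.items import MoocspiderItem

item = MoocspiderItem()
item['Title'] = 'Python 入门'  # hypothetical value for illustration
print(item['Title'])           # fields are accessed by key, as in the pipelines
print(dict(item))              # -> {'Title': 'Python 入门'}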
- settings.py

BOT_NAME = 'MoocSpider'

SPIDER_MODULES = ['MoocSpider.spiders']
NEWSPIDER_MODULE = 'MoocSpider.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'MoocSpider (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# MySQL connection settings, read by the pipeline via spider.settings.get()
MYSQL_DB_NAME = 'python_data'
MYSQL_HOST = 'localhost'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'root'

ITEM_PIPELINES = {
    'MoocSpider.pipelines.MysqlPipeline': 10,
}
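The pipelines below insert into a table named books in the python_data database, so both must exist before the spider runs. A one-off setup sketch; the column names and types are my assumption, and only the column count and order must match the INSERT statement used later:

import MySQLdb

# Create the database and table the pipeline will insert into (assumed schema)
conn = MySQLdb.connect(host='localhost', user='root', passwd='root', charset='utf8')
cur = conn.cursor()
cur.execute('CREATE DATABASE IF NOT EXISTS python_data DEFAULT CHARACTER SET utf8')
cur.execute('''
    CREATE TABLE IF NOT EXISTS python_data.books (
        url VARCHAR(255),
        title VARCHAR(255),
        image_url VARCHAR(255),
        student VARCHAR(64),
        introduction TEXT
    ) DEFAULT CHARACTER SET utf8
''')
conn.commit()
conn.close()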
- pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from twisted.enterprise import adbapi
# Note: install MySQLdb first -- the library Python uses to talk to MySQL
import MySQLdb

# Pipeline that inserts into the database through a connection pool
class MysqlPipeline(object):

    def open_spider(self, spider):
        db = spider.settings.get('MYSQL_DB_NAME', 'python_data')
        host = spider.settings.get('MYSQL_HOST', 'localhost')
        port = 3306
        user = spider.settings.get('MYSQL_USER', 'root')
        passwd = spider.settings.get('MYSQL_PASSWORD', 'root')
        self.dbpool = adbapi.ConnectionPool('MySQLdb', host=host, port=port, db=db,
                                            user=user, passwd=passwd, charset='utf8')

    def close_spider(self, spider):
        self.dbpool.close()

    def process_item(self, item, spider):
        # Hand the insert off to a thread from the pool
        self.dbpool.runInteraction(self.insert_db, item)
        return item

    def insert_db(self, tx, item):
        values = (
            item['Url'],
            item['Title'],
            item['Image_Url'],
            item['Student'],
            item['Introduction'],
        )
        sql = 'INSERT INTO books VALUES (%s,%s,%s,%s,%s)'
        tx.execute(sql, values)

# Pipeline that uses an ordinary (blocking) connection
class MysqlPipeline1(object):

    def open_spider(self, spider):
        db = spider.settings.get('MYSQL_DB_NAME', 'python_data')
        host = spider.settings.get('MYSQL_HOST', 'localhost')
        port = 3306
        user = spider.settings.get('MYSQL_USER', 'root')
        passwd = spider.settings.get('MYSQL_PASSWORD', 'root')
        self.db_conn = MySQLdb.connect(host=host, port=port, db=db,
                                       user=user, passwd=passwd, charset='utf8')
        self.db_cur = self.db_conn.cursor()

    def close_spider(self, spider):
        self.db_conn.close()

    def process_item(self, item, spider):
        try:
            self.insert_db(item)
            self.db_conn.commit()
        except Exception as error:
            print(error)
        return item

    def insert_db(self, item):
        values = (
            item['Url'],
            item['Title'],
            item['Image_Url'],
            item['Student'],
            item['Introduction'],
        )
        sql = 'INSERT INTO books VALUES (%s,%s,%s,%s,%s)'
        self.db_cur.execute(sql, values)
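One caveat with the pooled version: runInteraction returns a Deferred, so a failed insert disappears silently unless an errback is attached. A hedged variant of MysqlPipeline.process_item (the _handle_error helper is my own addition):

    def process_item(self, item, spider):
        d = self.dbpool.runInteraction(self.insert_db, item)
        # Log failures instead of dropping them silently
        d.addErrback(self._handle_error, item, spider)
        return item

    def _handle_error(self, failure, item, spider):
        spider.logger.error('DB insert failed for %s: %s', item['Url'], failure)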
- MoocSpider.py

import scrapy
from MoocSpider.items import MoocspiderItem

class MoocSpider(scrapy.Spider):
    name = 'MoocSpider'
    allowed_domains = ['imooc.com']
    start_urls = ['http://www.imooc.com/course/list']

    def parse(self, response):
        base_url = 'http://www.imooc.com'
        # One card per course on the list page
        content = response.xpath('//div[@class="course-card-container"]')
        for each in content:
            item = MoocspiderItem()
            item['Url'] = base_url + each.xpath('.//a/@href').extract_first()
            item['Title'] = each.xpath('.//a/div/h3/text()').extract_first()
            item['Image_Url'] = each.xpath('.//a/div/img/@src').extract_first()
            item['Student'] = each.xpath('.//a/div/div/div/span/text()').extract()[1]
            item['Introduction'] = each.xpath('.//a/div/div/p/text()').extract_first()
            yield item

        # Get the URL of the next page
        #url = response.xpath('//a[contains(text(),"下一页")]/@href').extract()
        #if url:
        #    page = 'http://www.imooc.com' + url[0]
        #    yield scrapy.Request(page, callback=self.parse)
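With everything in place, run the spider from the project root with scrapy crawl MoocSpider; each parsed course should end up as one row in the books table. Uncommenting the last block also follows the "下一页" (next page) links, so the whole course list gets crawled instead of just the first page.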
Reference: 《精通Scrapy网络爬虫》 (Mastering Scrapy Web Crawlers)
Please credit www.bywalks.com when reposting.