Scrapy ItemLoader usage examples
def parse_question(self, response):
    """Parse a Zhihu question page into a ``ZhihuQuestionItem``.

    Supports both the new page layout (detected via the
    ``QuestionHeader-title`` marker in the HTML) and the legacy layout.
    After loading the item, schedules a request for the first batch of
    answers and yields the question item itself.
    """
    # Extract the numeric question id from URLs such as
    # https://www.zhihu.com/question/12345/... ; bail out early if the
    # URL is not a question page (the original referenced question_item
    # without it ever being bound in that case).
    match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
    if not match_obj:
        return
    question_id = int(match_obj.group(2))

    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_value("url", response.url)
    item_loader.add_value("zhihu_id", question_id)

    if "QuestionHeader-title" in response.text:
        # New page layout.
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
        item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
        item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
    else:
        # Legacy page layout: field selectors differ, so load them here.
        item_loader.add_xpath(
            "title",
            "//*[@id='zh-question-title']/h2/a/text()"
            "|//*[@id='zh-question-title']/h2/span/text()")
        item_loader.add_css("content", "#zh-question-detail")
        item_loader.add_css("answer_num", "#zh-question-answer-num::text")
        item_loader.add_css("comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
        item_loader.add_xpath(
            "watch_user_num",
            "//*[@id='zh-question-side-header-wrap']/text()"
            "|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()")
        item_loader.add_css("topics", ".zm-tag-editor-labels a::text")

    question_item = item_loader.load_item()
    # Fetch the first page of answers (limit=20, offset=0).
    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0), headers=self.headers,
                         callback=self.parse_answer)
    yield question_item
3.5
8
def parse_news(self, response):
    """Load a ``News`` item from an article page.

    Incomplete items (missing title, content, or timestamp) are still
    returned so the item pipeline can drop them.
    """
    self.logger.info('parse_news: %s' % response)

    # Initialize the loader and record the source URL up front.
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)

    # Title is mandatory; without it the pipeline drops the item.
    title_sel = response.css('h1[itemprop="headline"]::text')
    if not title_sel:
        # Will be dropped on the item pipeline
        return loader.load_item()
    loader.add_value('title', title_sel.extract()[0])

    # Author is optional — fall back to an empty string.
    author_sel = response.css('a[rel="author"] > span::text')
    loader.add_value(
        'author_name',
        author_sel.extract()[0] if author_sel else '')

    # Content is mandatory as well.
    content_sel = response.css('.content')
    if not content_sel:
        # Will be dropped on the item pipeline
        return loader.load_item()
    joined = ' '.join([fragment.strip() for fragment in content_sel.extract()])
    loader.add_value('raw_content', joined.strip())

    # Timestamp is mandatory.
    time_sel = response.css('article > div.time::text')
    if not time_sel:
        # Will be dropped on the item pipeline
        return loader.load_item()

    # Parse date information.
    # Example: Selasa, 6 Oktober 2015 - 05:23 WIB
    # Drop the weekday prefix and the trailing " WIB" timezone suffix,
    # then translate each token (month names) via `_`.
    raw_date = time_sel.extract()[0]
    raw_date = raw_date.split(',')[1].strip()[:-4]
    raw_date = ' '.join([_(token) for token in raw_date.split(' ')])
    try:
        published_at_wib = datetime.strptime(raw_date, '%d %B %Y - %H:%M')
    except ValueError:
        # Will be dropped on the item pipeline
        return loader.load_item()
    loader.add_value('published_at', wib_to_utc(published_at_wib))

    # Move scraped news to pipeline.
    return loader.load_item()
Thank you!
8
0
4
7
def parse(self, response):
    """Scrape a single plant page into a ``PlantItem``."""
    # The page banner and the tabbed post-meta table share long XPath
    # prefixes; name them once so each field selector stays readable.
    banner = "//div[@id='bodycontent']/div[@class='post']/div[@class='pagebanner']"
    meta = ("//div[@id='bodycontent']/div[@class='post']/div[@class='contents']"
            "/div[@id='tabbedinfo']/div[@class='tabscontain']/div[@class='tabs']"
            "/div[@class='post-meta']")

    loader = ItemLoader(item=PlantItem(), response=response)
    loader.add_xpath('name', banner + "/h2/text()")
    loader.add_xpath('species', banner + "/div[@class='clear resultSpecies']/text()")
    loader.add_xpath('key', meta + "/div[@class='post-meta-key']/text()")
    # child::node() keeps mixed text/element children of the value cell.
    loader.add_xpath('value', meta + "/div[@class='post-meta-value']/child::node()")
    return loader.load_item()
Thank you!
7
0
4.14
7
def parse_item(self, response):
    """
    Extract fields from the individual email page and load them into the
    item.

    @url http://lkml.iu.edu/hypermail/linux/kernel/0111.3/0036.html
    @returns items 1 1
    @scrapes senderName senderEmail timeSent timeReceived subject body
    @scrapes replyto url
    """
    load = ItemLoader(item=Email(), selector=response)

    # Take care of easy fields first
    load.add_value('url', response.url)

    pattern_replyto = '//ul[1]/li[contains((b|strong), "In reply to:")]'
    pattern_replyto += '/a/@href'
    link = response.xpath(pattern_replyto).extract()
    # Default to an empty string when the email is not a reply.
    link = [''] if not link else link
    load.add_value('replyto', link[0])

    # Sometime in 2003, the archive changes and the email pages
    # require specific procedure to extract the following fields:
    specific_fields = {
        'senderName': None,
        'senderEmail': None,
        'timeSent': None,
        'timeReceived': None,
        'subject': None,
    }

    # Detect new archive system with HTML comment
    new_system = response.xpath('/comment()[1][contains(., "MHonArc")]')
    if len(new_system) >= 1:
        # If new archive system is detected...
        specific_fields = self.parse_new_system(response, specific_fields)
        body_before_comment = '<!--X-Body-of-Message-->'
        body_after_comment = '<!--X-Body-of-Message-End-->'
    else:
        # Otherwise...
        specific_fields = self.parse_old_system(response, specific_fields)
        body_before_comment = '<!-- body="start" -->'
        body_after_comment = '<!-- body="end" -->'

    # Load all the values from these specific fields
    for key, val in specific_fields.items():
        load.add_value(key, val)

    if self.get_body:
        # Final field, the body of the email
        pattern_body = body_before_comment + '\n?(.*)' + body_after_comment
        # Ignore invalid bytes when necessary
        page_body = response.body.decode('utf-8', 'ignore')
        body = re.search(pattern_body, page_body, flags=re.S)
        # Guard against pages missing the body delimiters: re.search
        # returns None there, and the original crashed on .group(1).
        if body is not None:
            load.add_value('body', body.group(1))

    return load.load_item()
Thank you!
7
0
4.17
6
def parse_item(self, response):
    """Load a ``PageItem`` with page metadata and image URLs."""
    loader = ItemLoader(item=PageItem(), response=response)
    # The title was stashed in the request cookies by the caller.
    loader.add_value('title', response.request.cookies['title'])
    loader.add_value('name', self.name)
    loader.add_value('url', response.url)
    loader.add_xpath('image_urls', '//td[@valign="top"]/img/@src')
    return loader.load_item()
Thank you!
6
0
4.27
10
def parse_book(self, response):
    """Populate a ``BookItem`` from a book detail page."""
    loader = ItemLoader(item=BookItem(), response=response)
    # Strip HTML tags from every loaded value by default.
    loader.default_input_processor = MapCompose(remove_tags)

    # Cover image: make the relative src absolute before loading.
    cover_src = response.css(".item.active > img::attr(src)").extract_first()
    loader.add_value("image_urls", response.urljoin(cover_src))

    # Remaining fields all follow the same "first match via CSS" pattern,
    # so drive them from a table instead of repeating the call.
    css_fields = [
        ("title", ".col-sm-6.product_main > h1"),
        ("price", ".price_color"),
        ("upc", ".table.table-striped > tr:nth-child(1) > td"),
        ("product_type", ".table.table-striped > tr:nth-child(2) > td"),
        ("tax", ".table.table-striped > tr:nth-child(5) > td"),
        ("stock", ".table.table-striped > tr:nth-child(6) > td"),
        ("reviews", ".table.table-striped > tr:nth-child(7) > td"),
        ("rating", ".star-rating::attr(class)"),
    ]
    for field, selector in css_fields:
        loader.add_css(field, selector, TakeFirst())

    return loader.load_item()
Thank you!
10
0
4.11
9
def parse(self, response):
    """Extract an ``Area`` item (id, name, update timestamp) from the page."""
    loader = ItemLoader(item=Area(), response=response)
    # The area id lives in the query string of the second subnav link.
    subnav_href = response.xpath(
        '//div[@class="clearfix subnav level-1"]//li//a[2]/@href').extract()[0]
    loader.add_value('id', parse_qs(subnav_href)['area_id'][0])
    loader.add_xpath('name', '//div[@class="clearfix subnav level-1"]//li//a[2]/text()')
    # you can also use literal values
    loader.add_value('updated', datetime.utcnow().isoformat())
    return loader.load_item()
    #self.log('URL: {}'.format(response.url))
Thank you!
9
0
3
2
def parse(self, response):
    """Walk the comparison table and yield a request per major found.

    NOTE(review): `unicode` implies this is Python 2 code; kept as-is.
    """
    for row in response.css('#comapreTable tr:not(:first-child)'):
        # Rows with a centered cell carry the college code/name that
        # applies to the following major cells.
        if not row.css('td[align="center"]'):
            continue
        ccode = row.css('td[align="center"]>a::attr(id)').extract_first()
        cname = row.css('td[align="center"]>a::text').extract_first()
        for cell in row.xpath('td[div[@align="left"]/a]'):
            loader = ItemLoader(item=EolZhuanyeItem(), selector=cell)
            loader.add_value('ccode', ccode)
            loader.add_value('cname', cname)
            # Resolve the relative major link against the site root.
            loader.add_css('url', 'a::attr(href)',
                           lambda urls: urljoin(self.start_urls[0], urls[0]))
            loader.add_xpath('code', 'following-sibling::td[1]/text()',
                             MapCompose(unicode.strip))
            loader.add_css('name', 'a::text', MapCompose(unicode.strip))
            item = loader.load_item()
            yield Request(url=item['url'][0], meta={'item': item},
                          callback=self.parse_item)
Thank you!
2
0
0
7
def parse_song_list(self, response):
    """Yield one detail request per song on a playlist page.

    Each request carries a partially-filled ``PlayListItem`` loader in
    its meta for ``parse_single_song`` to complete.
    """
    selector = Selector(response)
    song_name_list = selector.xpath('//body//ul[@class="f-hide"]/li/a/text()').extract()
    song_id_list = selector.xpath('//body//ul[@class="f-hide"]/li/a/@href').extract()
    title = selector.xpath('//title/text()').extract()
    # zip() pairs names with ids and stops at the shorter list — the
    # original index-based lookup raised IndexError whenever the two
    # extracted lists differed in length.
    for song_name, id_ in zip(song_name_list, song_id_list):
        l = ItemLoader(item=PlayListItem())
        l.add_value('song_name', song_name)
        l.add_value('title', title)
        # id_ is an href like "/song?id=12345"; id_[9:] strips the path
        # prefix to keep only the numeric id.
        yield scrapy.FormRequest(url=self.BASE_URL + id_, meta={'song_id': id_[9:], 'loader': l}, method='GET',
                                 headers=self.headers, callback=self.parse_single_song)
Thank you!
7
0
4.25
4
def parse_question(self, response):
    """Parse a (new-layout) Zhihu question page into a ``ZhihuQuestionItem``,
    then schedule the first answers request.
    """
    # Raw string so \d reaches the regex engine unescaped.
    question_pattern = re.compile(r'(.*zhihu.com/question/(\d+))(/|$).*')
    match_object = question_pattern.match(response.url)
    if match_object is None:
        # Not a question URL — the original crashed here with
        # AttributeError on .group(2).
        return
    question_id = match_object.group(2)

    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_value('zhihu_id', question_id)
    item_loader.add_css('title', 'h1.QuestionHeader-title::text')
    item_loader.add_css('topics', '.TopicLink .Popover div::text')
    item_loader.add_value('url', response.url)
    item_loader.add_css('content', '.QuestionHeader-detail div div span::text')
    item_loader.add_css('answer_num', '.List-headerText span::text')
    item_loader.add_css('comments_num', '.QuestionHeader-Comment button::text')
    item_loader.add_css('watch_user_num', '.NumberBoard-value::text')

    yield item_loader.load_item()
    # Kick off retrieval of the first page of answers.
    yield scrapy.Request(self.start_answer_url.format(question_id=question_id, offset=0, limit=20),
                         headers=self.headers, callback=self.parse_answer)
Thank you!
4
0
Are there any code examples left?
New code examples in category Other