Scrapy ItemLoader usage examples
def parse_question(self, response):
    """Parse a Zhihu question page into a ``ZhihuQuestionItem``.

    Supports both the new page layout (detected via the
    ``QuestionHeader-title`` marker in the HTML) and the legacy layout.
    After loading the item, schedules a request for the first batch of
    answers and yields the question item itself.
    """
    # Extract the numeric question id from URLs such as
    # https://www.zhihu.com/question/12345/... ; bail out early if the
    # URL is not a question page (the original referenced question_item
    # without it ever being bound in that case).
    match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
    if not match_obj:
        return
    question_id = int(match_obj.group(2))

    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_value("url", response.url)
    item_loader.add_value("zhihu_id", question_id)

    if "QuestionHeader-title" in response.text:
        # New page layout.
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
        item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
        item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
    else:
        # Legacy page layout: field selectors differ, so load them here.
        item_loader.add_xpath(
            "title",
            "//*[@id='zh-question-title']/h2/a/text()"
            "|//*[@id='zh-question-title']/h2/span/text()")
        item_loader.add_css("content", "#zh-question-detail")
        item_loader.add_css("answer_num", "#zh-question-answer-num::text")
        item_loader.add_css("comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
        item_loader.add_xpath(
            "watch_user_num",
            "//*[@id='zh-question-side-header-wrap']/text()"
            "|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()")
        item_loader.add_css("topics", ".zm-tag-editor-labels a::text")

    question_item = item_loader.load_item()
    # Fetch the first page of answers (limit=20, offset=0).
    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0), headers=self.headers,
                         callback=self.parse_answer)
    yield question_item
3.5
8
def parse_news(self, response):
    """Load a ``News`` item from an article page.

    Incomplete items (missing title, content, or timestamp) are still
    returned so the item pipeline can drop them.
    """
    self.logger.info('parse_news: %s' % response)

    # Initialize the loader and record the source URL up front.
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)

    # Title is mandatory; without it the pipeline drops the item.
    title_sel = response.css('h1[itemprop="headline"]::text')
    if not title_sel:
        # Will be dropped on the item pipeline
        return loader.load_item()
    loader.add_value('title', title_sel.extract()[0])

    # Author is optional — fall back to an empty string.
    author_sel = response.css('a[rel="author"] > span::text')
    loader.add_value(
        'author_name',
        author_sel.extract()[0] if author_sel else '')

    # Content is mandatory as well.
    content_sel = response.css('.content')
    if not content_sel:
        # Will be dropped on the item pipeline
        return loader.load_item()
    joined = ' '.join([fragment.strip() for fragment in content_sel.extract()])
    loader.add_value('raw_content', joined.strip())

    # Timestamp is mandatory.
    time_sel = response.css('article > div.time::text')
    if not time_sel:
        # Will be dropped on the item pipeline
        return loader.load_item()

    # Parse date information.
    # Example: Selasa, 6 Oktober 2015 - 05:23 WIB
    # Drop the weekday prefix and the trailing " WIB" timezone suffix,
    # then translate each token (month names) via `_`.
    raw_date = time_sel.extract()[0]
    raw_date = raw_date.split(',')[1].strip()[:-4]
    raw_date = ' '.join([_(token) for token in raw_date.split(' ')])
    try:
        published_at_wib = datetime.strptime(raw_date, '%d %B %Y - %H:%M')
    except ValueError:
        # Will be dropped on the item pipeline
        return loader.load_item()
    loader.add_value('published_at', wib_to_utc(published_at_wib))

    # Move scraped news to pipeline.
    return loader.load_item()
Thank you!
8
0
4
7
def parse(self, response):
    """Scrape a single plant page into a ``PlantItem``."""
    # The page banner and the tabbed post-meta table share long XPath
    # prefixes; name them once so each field selector stays readable.
    banner = "//div[@id='bodycontent']/div[@class='post']/div[@class='pagebanner']"
    meta = ("//div[@id='bodycontent']/div[@class='post']/div[@class='contents']"
            "/div[@id='tabbedinfo']/div[@class='tabscontain']/div[@class='tabs']"
            "/div[@class='post-meta']")

    loader = ItemLoader(item=PlantItem(), response=response)
    loader.add_xpath('name', banner + "/h2/text()")
    loader.add_xpath('species', banner + "/div[@class='clear resultSpecies']/text()")
    loader.add_xpath('key', meta + "/div[@class='post-meta-key']/text()")
    # child::node() keeps mixed text/element children of the value cell.
    loader.add_xpath('value', meta + "/div[@class='post-meta-value']/child::node()")
    return loader.load_item()
Thank you!
7
0
4.14
7
def parse_item(self, response):
    """
    Extract fields from the individual email page and load them into the
    item.

    @url http://lkml.iu.edu/hypermail/linux/kernel/0111.3/0036.html
    @returns items 1 1
    @scrapes senderName senderEmail timeSent timeReceived subject body
    @scrapes replyto url
    """
    load = ItemLoader(item=Email(), selector=response)

    # Take care of easy fields first
    load.add_value('url', response.url)

    pattern_replyto = '//ul[1]/li[contains((b|strong), "In reply to:")]'
    pattern_replyto += '/a/@href'
    link = response.xpath(pattern_replyto).extract()
    # Default to an empty string when the email is not a reply.
    link = [''] if not link else link
    load.add_value('replyto', link[0])

    # Sometime in 2003, the archive changes and the email pages
    # require specific procedure to extract the following fields:
    specific_fields = {
        'senderName': None,
        'senderEmail': None,
        'timeSent': None,
        'timeReceived': None,
        'subject': None,
    }

    # Detect new archive system with HTML comment
    new_system = response.xpath('/comment()[1][contains(., "MHonArc")]')
    if len(new_system) >= 1:
        # If new archive system is detected...
        specific_fields = self.parse_new_system(response, specific_fields)
        body_before_comment = '<!--X-Body-of-Message-->'
        body_after_comment = '<!--X-Body-of-Message-End-->'
    else:
        # Otherwise...
        specific_fields = self.parse_old_system(response, specific_fields)
        body_before_comment = '<!-- body="start" -->'
        body_after_comment = '<!-- body="end" -->'

    # Load all the values from these specific fields
    for key, val in specific_fields.items():
        load.add_value(key, val)

    if self.get_body:
        # Final field, the body of the email
        pattern_body = body_before_comment + '\n?(.*)' + body_after_comment
        # Ignore invalid bytes when necessary
        page_body = response.body.decode('utf-8', 'ignore')
        body = re.search(pattern_body, page_body, flags=re.S)
        # Guard against pages missing the body delimiters: re.search
        # returns None there, and the original crashed on .group(1).
        if body is not None:
            load.add_value('body', body.group(1))

    return load.load_item()
Thank you!
7
0
4.17
6
def parse_item(self, response):
    """Load a ``PageItem`` with page metadata and image URLs."""
    loader = ItemLoader(item=PageItem(), response=response)
    # The title was stashed in the request cookies by the caller.
    loader.add_value('title', response.request.cookies['title'])
    loader.add_value('name', self.name)
    loader.add_value('url', response.url)
    loader.add_xpath('image_urls', '//td[@valign="top"]/img/@src')
    return loader.load_item()
Thank you!
6
0
4.27
10
def parse_book(self, response):
    """Populate a ``BookItem`` from a book detail page."""
    loader = ItemLoader(item=BookItem(), response=response)
    # Strip HTML tags from every loaded value by default.
    loader.default_input_processor = MapCompose(remove_tags)

    # Cover image: make the relative src absolute before loading.
    cover_src = response.css(".item.active > img::attr(src)").extract_first()
    loader.add_value("image_urls", response.urljoin(cover_src))

    # Remaining fields all follow the same "first match via CSS" pattern,
    # so drive them from a table instead of repeating the call.
    css_fields = [
        ("title", ".col-sm-6.product_main > h1"),
        ("price", ".price_color"),
        ("upc", ".table.table-striped > tr:nth-child(1) > td"),
        ("product_type", ".table.table-striped > tr:nth-child(2) > td"),
        ("tax", ".table.table-striped > tr:nth-child(5) > td"),
        ("stock", ".table.table-striped > tr:nth-child(6) > td"),
        ("reviews", ".table.table-striped > tr:nth-child(7) > td"),
        ("rating", ".star-rating::attr(class)"),
    ]
    for field, selector in css_fields:
        loader.add_css(field, selector, TakeFirst())

    return loader.load_item()
Thank you!
10
0
4.11
9
def parse(self, response):
    """Extract an ``Area`` item (id, name, update timestamp) from the page."""
    loader = ItemLoader(item=Area(), response=response)
    # The area id lives in the query string of the second subnav link.
    subnav_href = response.xpath(
        '//div[@class="clearfix subnav level-1"]//li//a[2]/@href').extract()[0]
    loader.add_value('id', parse_qs(subnav_href)['area_id'][0])
    loader.add_xpath('name', '//div[@class="clearfix subnav level-1"]//li//a[2]/text()')
    # you can also use literal values
    loader.add_value('updated', datetime.utcnow().isoformat())
    return loader.load_item()
    #self.log('URL: {}'.format(response.url))
Thank you!
9
0
3
2
def parse(self, response):
    """Walk the comparison table and yield a request per major found.

    NOTE(review): `unicode` implies this is Python 2 code; kept as-is.
    """
    for row in response.css('#comapreTable tr:not(:first-child)'):
        # Rows with a centered cell carry the college code/name that
        # applies to the following major cells.
        if not row.css('td[align="center"]'):
            continue
        ccode = row.css('td[align="center"]>a::attr(id)').extract_first()
        cname = row.css('td[align="center"]>a::text').extract_first()
        for cell in row.xpath('td[div[@align="left"]/a]'):
            loader = ItemLoader(item=EolZhuanyeItem(), selector=cell)
            loader.add_value('ccode', ccode)
            loader.add_value('cname', cname)
            # Resolve the relative major link against the site root.
            loader.add_css('url', 'a::attr(href)',
                           lambda urls: urljoin(self.start_urls[0], urls[0]))
            loader.add_xpath('code', 'following-sibling::td[1]/text()',
                             MapCompose(unicode.strip))
            loader.add_css('name', 'a::text', MapCompose(unicode.strip))
            item = loader.load_item()
            yield Request(url=item['url'][0], meta={'item': item},
                          callback=self.parse_item)
Thank you!
2
0
0
7
def parse_song_list(self, response):
    """Yield one detail request per song on a playlist page.

    Each request carries a partially-filled ``PlayListItem`` loader in
    its meta for ``parse_single_song`` to complete.
    """
    selector = Selector(response)
    song_name_list = selector.xpath('//body//ul[@class="f-hide"]/li/a/text()').extract()
    song_id_list = selector.xpath('//body//ul[@class="f-hide"]/li/a/@href').extract()
    title = selector.xpath('//title/text()').extract()
    # zip() pairs names with ids and stops at the shorter list — the
    # original index-based lookup raised IndexError whenever the two
    # extracted lists differed in length.
    for song_name, id_ in zip(song_name_list, song_id_list):
        l = ItemLoader(item=PlayListItem())
        l.add_value('song_name', song_name)
        l.add_value('title', title)
        # id_ is an href like "/song?id=12345"; id_[9:] strips the path
        # prefix to keep only the numeric id.
        yield scrapy.FormRequest(url=self.BASE_URL + id_, meta={'song_id': id_[9:], 'loader': l}, method='GET',
                                 headers=self.headers, callback=self.parse_single_song)
Thank you!
7
0
4.25
4
def parse_question(self, response):
    """Parse a (new-layout) Zhihu question page into a ``ZhihuQuestionItem``,
    then schedule the first answers request.
    """
    # Raw string so \d reaches the regex engine unescaped.
    question_pattern = re.compile(r'(.*zhihu.com/question/(\d+))(/|$).*')
    match_object = question_pattern.match(response.url)
    if match_object is None:
        # Not a question URL — the original crashed here with
        # AttributeError on .group(2).
        return
    question_id = match_object.group(2)

    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_value('zhihu_id', question_id)
    item_loader.add_css('title', 'h1.QuestionHeader-title::text')
    item_loader.add_css('topics', '.TopicLink .Popover div::text')
    item_loader.add_value('url', response.url)
    item_loader.add_css('content', '.QuestionHeader-detail div div span::text')
    item_loader.add_css('answer_num', '.List-headerText span::text')
    item_loader.add_css('comments_num', '.QuestionHeader-Comment button::text')
    item_loader.add_css('watch_user_num', '.NumberBoard-value::text')

    yield item_loader.load_item()
    # Kick off retrieval of the first page of answers.
    yield scrapy.Request(self.start_answer_url.format(question_id=question_id, offset=0, limit=20),
                         headers=self.headers, callback=self.parse_answer)
Thank you!
4
0
Are there any code examples left?
New code examples in category Other