diff --git a/sources/academy/tutorials/node_js/dealing_with_dynamic_pages.md b/sources/academy/tutorials/node_js/dealing_with_dynamic_pages.md
index db8311841d..72a84e00ac 100644
--- a/sources/academy/tutorials/node_js/dealing_with_dynamic_pages.md
+++ b/sources/academy/tutorials/node_js/dealing_with_dynamic_pages.md
@@ -144,7 +144,6 @@ So, we've gotta scroll down the page to load these images. Luckily, because we'r
 
 ```js
 import { PuppeteerCrawler, utils, Dataset } from 'crawlee';
-import cheerio from 'cheerio';
 
 const BASE_URL = 'https://demo-webstore.apify.org/';
 
diff --git a/sources/academy/webscraping/puppeteer_playwright/common_use_cases/paginating_through_results.md b/sources/academy/webscraping/puppeteer_playwright/common_use_cases/paginating_through_results.md
index 80029df96a..b02b5638f9 100644
--- a/sources/academy/webscraping/puppeteer_playwright/common_use_cases/paginating_through_results.md
+++ b/sources/academy/webscraping/puppeteer_playwright/common_use_cases/paginating_through_results.md
@@ -16,7 +16,7 @@ import TabItem from '@theme/TabItem';
 
 If you're trying to [collect data](../executing_scripts/extracting_data.md) on a website that has millions, thousands, or even just hundreds of results, it is very likely that they are paginating their results to reduce strain on their backend as well as on the users loading and rendering the content.
 
-![Amazon pagination](https://apify-docs.s3.amazonaws.com/master/docs/assets/tutorials/images/pagination.jpg)
+![Amazon pagination](../../advanced_web_scraping/images/pagination.png)
 
 Attempting to scrape thousands to tens of thousands of results using a headless browser on a website that only shows 30 results at a time might be daunting at first, but be rest assured that by the end of this lesson you'll feel confident when faced with this use case.
 
@@ -53,7 +53,6 @@ Let's grab this number now with a little bit of code:
 
 ```javascript
 import { chromium } from 'playwright';
-import { load } from 'cheerio';
 
 const repositories = [];
 
@@ -79,7 +78,6 @@ await browser.close();
 
 ```javascript
 import puppeteer from 'puppeteer';
-import { load } from 'cheerio';
 
 const repositories = [];
 
@@ -118,7 +116,7 @@ And since we're already on the first page, we'll go ahead and scrape the repos f
 
 ```javascript
 import { chromium } from 'playwright';
-import { load } from 'cheerio';
+import * as cheerio from 'cheerio';
 
 const repositories = [];
 
@@ -127,7 +125,7 @@ const REPOSITORIES_URL = `${BASE_URL}/orgs/facebook/repositories`;
 
 // Create a function which grabs all repos from a page
 const scrapeRepos = async (page) => {
-    const $ = load(await page.content());
+    const $ = cheerio.load(await page.content());
 
     return [...$('li.Box-row')].map((item) => {
         const elem = $(item);
@@ -163,7 +161,7 @@ await browser.close();
 
 ```javascript
 import puppeteer from 'puppeteer';
-import { load } from 'cheerio';
+import * as cheerio from 'cheerio';
 
 const repositories = [];
 
@@ -172,7 +170,7 @@ const REPOSITORIES_URL = `${BASE_URL}/orgs/facebook/repositories`;
 
 // Create a function which grabs all repos from a page
 const scrapeRepos = async (page) => {
-    const $ = load(await page.content());
+    const $ = cheerio.load(await page.content());
 
     return [...$('li.Box-row')].map((item) => {
         const elem = $(item);
@@ -260,7 +258,7 @@ After all is said and done, here's what our final code looks like:
 
 ```javascript
 import { chromium } from 'playwright';
-import { load } from 'cheerio';
+import * as cheerio from 'cheerio';
 
 const repositories = [];
 
@@ -268,7 +266,7 @@ const BASE_URL = 'https://github.com/';
 const REPOSITORIES_URL = `${BASE_URL}/orgs/facebook/repositories`;
 
 const scrapeRepos = async (page) => {
-    const $ = load(await page.content());
+    const $ = cheerio.load(await page.content());
 
     return [...$('li.Box-row')].map((item) => {
         const elem = $(item);
@@ -321,7 +319,7 @@ await browser.close();
 
 ```javascript
 import puppeteer from 'puppeteer';
-import { load } from 'cheerio';
+import * as cheerio from 'cheerio';
 
 const repositories = [];
 
@@ -330,7 +328,7 @@ const REPOSITORIES_URL = `${BASE_URL}/orgs/facebook/repositories`;
 
 // Create a function which grabs all repos from a page
 const scrapeRepos = async (page) => {
-    const $ = load(await page.content());
+    const $ = cheerio.load(await page.content());
 
     return [...$('li.Box-row')].map((item) => {
         const elem = $(item);
@@ -402,7 +400,6 @@ We're going to scrape the brand and price from the first 75 results on the **Abo
 
 ```javascript
 import { chromium } from 'playwright';
-import { load } from 'cheerio';
 
 // Create an array where all scraped products will
 // be pushed to
@@ -421,7 +418,6 @@ await browser.close();
 
 ```javascript
 import puppeteer from 'puppeteer';
-import { load } from 'cheerio';
 
 // Create an array where all scraped products will
 // be pushed to
@@ -543,7 +539,9 @@ Now, the `while` loop will exit out if we've reached the bottom of the page.
 Within the loop, we can grab hold of the total number of items on the page. To avoid extracting and pushing duplicate items to the **products** array, we can use the `.slice()` method to cut out the items we've already scraped.
 
 ```js
-const $ = load(await page.content());
+import * as cheerio from 'cheerio';
+
+const $ = cheerio.load(await page.content());
 
 // Grab the newly loaded items
 const items = [...$('a[data-testid*="productTile"]')].slice(products.length);
@@ -569,7 +567,7 @@ With everything completed, this is what we're left with:
 
 ```javascript
 import { chromium } from 'playwright';
-import { load } from 'cheerio';
+import * as cheerio from 'cheerio';
 
 const products = [];
 
@@ -592,7 +590,7 @@ while (products.length < 75) {
     // Allow the products 1 second to load
     await page.waitForTimeout(1000);
 
-    const $ = load(await page.content());
+    const $ = cheerio.load(await page.content());
 
     // Grab the newly loaded items
     const items = [...$('a[data-testid*="productTile"]')].slice(products.length);
@@ -628,7 +626,7 @@ await browser.close();
 
 ```javascript
 import puppeteer from 'puppeteer';
-import { load } from 'cheerio';
+import * as cheerio from 'cheerio';
 
 const products = [];
 
@@ -651,7 +649,7 @@ while (products.length < 75) {
     // Allow the products 1 second to load
     await page.waitForTimeout(1000);
 
-    const $ = load(await page.content());
+    const $ = cheerio.load(await page.content());
 
     // Grab the newly loaded items
     const items = [...$('a[data-testid*="productTile"]')].slice(products.length);
diff --git a/sources/academy/webscraping/web_scraping_for_beginners/crawling/finding_links.md b/sources/academy/webscraping/web_scraping_for_beginners/crawling/finding_links.md
index df92efafd9..3fc4d7538d 100644
--- a/sources/academy/webscraping/web_scraping_for_beginners/crawling/finding_links.md
+++ b/sources/academy/webscraping/web_scraping_for_beginners/crawling/finding_links.md
@@ -53,7 +53,7 @@ We'll start from a boilerplate that's very similar to the scraper we built in [B
 
 ```js title=crawler.js
 import { gotScraping } from 'got-scraping';
-import cheerio from 'cheerio';
+import * as cheerio from 'cheerio';
 
 const storeUrl = 'https://warehouse-theme-metal.myshopify.com/collections/sales';
 
diff --git a/sources/academy/webscraping/web_scraping_for_beginners/crawling/first_crawl.md b/sources/academy/webscraping/web_scraping_for_beginners/crawling/first_crawl.md
index e98b3eca19..588f4177fa 100644
--- a/sources/academy/webscraping/web_scraping_for_beginners/crawling/first_crawl.md
+++ b/sources/academy/webscraping/web_scraping_for_beginners/crawling/first_crawl.md
@@ -21,7 +21,7 @@ In the previous lessons, we collected and filtered all the URLs pointing to indi
 
 ```js title=crawler.js
 import { gotScraping } from 'got-scraping';
-import cheerio from 'cheerio';
+import * as cheerio from 'cheerio';
 
 const WEBSITE_URL = 'https://warehouse-theme-metal.myshopify.com/';
 const storeUrl = `${WEBSITE_URL}/collections/sales`;
@@ -75,7 +75,7 @@ In programming, you handle errors by catching and handling them. Typically by pr
 
 ```js title=crawler.js
 import { gotScraping } from 'got-scraping';
-import cheerio from 'cheerio';
+import * as cheerio from 'cheerio';
 
 const WEBSITE_URL = 'https://warehouse-theme-metal.myshopify.com/';
 const storeUrl = `${WEBSITE_URL}/collections/sales`;
diff --git a/sources/academy/webscraping/web_scraping_for_beginners/crawling/recap_extraction_basics.md b/sources/academy/webscraping/web_scraping_for_beginners/crawling/recap_extraction_basics.md
index e1aebc9abd..a194274bb3 100644
--- a/sources/academy/webscraping/web_scraping_for_beginners/crawling/recap_extraction_basics.md
+++ b/sources/academy/webscraping/web_scraping_for_beginners/crawling/recap_extraction_basics.md
@@ -18,7 +18,7 @@ We finished off the [first section](../data_extraction/index.md) of the _Web Scr
 // download, extract, and convert the data we wanted
 import { writeFileSync } from 'fs';
 import { gotScraping } from 'got-scraping';
-import cheerio from 'cheerio';
+import * as cheerio from 'cheerio';
 import { parse } from 'json2csv';
 
 // Here, we fetched the website's HTML and saved it to a new variable.
diff --git a/sources/academy/webscraping/web_scraping_for_beginners/crawling/relative_urls.md b/sources/academy/webscraping/web_scraping_for_beginners/crawling/relative_urls.md
index e191c5535a..8ab7b5e525 100644
--- a/sources/academy/webscraping/web_scraping_for_beginners/crawling/relative_urls.md
+++ b/sources/academy/webscraping/web_scraping_for_beginners/crawling/relative_urls.md
@@ -35,7 +35,7 @@ Let's update the Node.js code from the [Finding links lesson](./finding_links.md
 
 ```js title=crawler.js
 import { gotScraping } from 'got-scraping';
-import cheerio from 'cheerio';
+import * as cheerio from 'cheerio';
 
 const storeUrl = 'https://warehouse-theme-metal.myshopify.com/collections/sales';
 
@@ -72,7 +72,7 @@ When we plug this into our crawler code, we will get the correct - absolute - UR
 
 ```js title=crawler.js
 import { gotScraping } from 'got-scraping';
-import cheerio from 'cheerio';
+import * as cheerio from 'cheerio';
 
 // Split the base URL from the category to use it later.
 const WEBSITE_URL = 'https://warehouse-theme-metal.myshopify.com/';
diff --git a/sources/academy/webscraping/web_scraping_for_beginners/crawling/scraping_the_data.md b/sources/academy/webscraping/web_scraping_for_beginners/crawling/scraping_the_data.md
index cfa36af6b0..9f2cb7bc22 100644
--- a/sources/academy/webscraping/web_scraping_for_beginners/crawling/scraping_the_data.md
+++ b/sources/academy/webscraping/web_scraping_for_beginners/crawling/scraping_the_data.md
@@ -21,7 +21,7 @@ Let's start writing a script that extracts data from this single PDP. We can use
 
 ```js title=product.js
 import { gotScraping } from 'got-scraping';
-import cheerio from 'cheerio';
+import * as cheerio from 'cheerio';
 
 const productUrl = 'https://warehouse-theme-metal.myshopify.com/products/denon-ah-c720-in-ear-headphones';
 const response = await gotScraping(productUrl);
@@ -123,7 +123,7 @@ Let's compare the above data extraction example with the crawling code we wrote
 
 ```js title=crawler.js
 import { gotScraping } from 'got-scraping';
-import cheerio from 'cheerio';
+import * as cheerio from 'cheerio';
 
 const WEBSITE_URL = 'https://warehouse-theme-metal.myshopify.com/';
 const storeUrl = `${WEBSITE_URL}/collections/sales`;
@@ -171,7 +171,7 @@ We'll start by adding our imports and constants at the top of the file, no chang
 
 ```js title=final.js
 import { gotScraping } from 'got-scraping';
-import cheerio from 'cheerio';
+import * as cheerio from 'cheerio';
 
 const WEBSITE_URL = 'https://warehouse-theme-metal.myshopify.com/';
 ```
diff --git a/sources/academy/webscraping/web_scraping_for_beginners/data_extraction/node_continued.md b/sources/academy/webscraping/web_scraping_for_beginners/data_extraction/node_continued.md
index 48e37c70a6..70dd10655f 100644
--- a/sources/academy/webscraping/web_scraping_for_beginners/data_extraction/node_continued.md
+++ b/sources/academy/webscraping/web_scraping_for_beginners/data_extraction/node_continued.md
@@ -38,7 +38,7 @@ Replace the code in your **main.js** with the following, and run it with `node m
 ```js
 // main.js
 import { gotScraping } from 'got-scraping';
-import cheerio from 'cheerio';
+import * as cheerio from 'cheerio';
 
 const storeUrl = 'https://warehouse-theme-metal.myshopify.com/collections/sales';
 
@@ -110,7 +110,7 @@ The final scraper code looks like this. Replace the code in your **main.js** fil
 ```js
 // main.js
 import { gotScraping } from 'got-scraping';
-import cheerio from 'cheerio';
+import * as cheerio from 'cheerio';
 
 const storeUrl = 'https://warehouse-theme-metal.myshopify.com/collections/sales';
 
diff --git a/sources/academy/webscraping/web_scraping_for_beginners/data_extraction/node_js_scraper.md b/sources/academy/webscraping/web_scraping_for_beginners/data_extraction/node_js_scraper.md
index 4cc7319eb3..97215da452 100644
--- a/sources/academy/webscraping/web_scraping_for_beginners/data_extraction/node_js_scraper.md
+++ b/sources/academy/webscraping/web_scraping_for_beginners/data_extraction/node_js_scraper.md
@@ -43,7 +43,7 @@ To parse the HTML with the `cheerio` library. Replace the code in your **main.js
 ```js
 // main.js
 import { gotScraping } from 'got-scraping';
-import cheerio from 'cheerio';
+import * as cheerio from 'cheerio';
 
 const storeUrl = 'https://warehouse-theme-metal.myshopify.com/collections/sales';
 
diff --git a/sources/academy/webscraping/web_scraping_for_beginners/data_extraction/project_setup.md b/sources/academy/webscraping/web_scraping_for_beginners/data_extraction/project_setup.md
index d82160a2d7..888fc718dc 100644
--- a/sources/academy/webscraping/web_scraping_for_beginners/data_extraction/project_setup.md
+++ b/sources/academy/webscraping/web_scraping_for_beginners/data_extraction/project_setup.md
@@ -57,7 +57,7 @@ With the libraries installed, create a new file in the project's folder called *
 
 ```js
 import gotScraping from 'got-scraping';
-import cheerio from 'cheerio';
+import * as cheerio from 'cheerio';
 
 console.log('it works!');
 ```
diff --git a/sources/academy/webscraping/web_scraping_for_beginners/data_extraction/save_to_csv.md b/sources/academy/webscraping/web_scraping_for_beginners/data_extraction/save_to_csv.md
index faa19a9c76..f44eca7516 100644
--- a/sources/academy/webscraping/web_scraping_for_beginners/data_extraction/save_to_csv.md
+++ b/sources/academy/webscraping/web_scraping_for_beginners/data_extraction/save_to_csv.md
@@ -40,7 +40,7 @@ The full code including the earlier scraping part now looks like this. Replace t
 ```js
 // main.js
 import { gotScraping } from 'got-scraping';
-import cheerio from 'cheerio';
+import * as cheerio from 'cheerio';
 import { parse } from 'json2csv'; // <---- added a new import
 
 const storeUrl = 'https://warehouse-theme-metal.myshopify.com/collections/sales';
@@ -105,7 +105,7 @@ When we complete the code, it looks like this. Replace the code in your **main.j
 // main.js
 import { writeFileSync } from 'fs'; // <---- added a new import
 import { gotScraping } from 'got-scraping';
-import cheerio from 'cheerio';
+import * as cheerio from 'cheerio';
 import { parse } from 'json2csv';
 
 const storeUrl = 'https://warehouse-theme-metal.myshopify.com/collections/sales';
diff --git a/sources/platform/actors/development/actor_definition/actor_json.md b/sources/platform/actors/development/actor_definition/actor_json.md
index 047b4ba4fe..961ee7191c 100644
--- a/sources/platform/actors/development/actor_definition/actor_json.md
+++ b/sources/platform/actors/development/actor_definition/actor_json.md
@@ -61,8 +61,8 @@ import TabItem from '@theme/TabItem';
 
 > Note that actor `name`, `version`, `buildTag`, and `environmentVariables` are currently only used when you deploy your actor using [Apify CLI](/cli) and not when deployed, for example, via GitHub integration. There it serves for informative purposes only. This is suspected to change in the future.
 
-| Property | Type | Description |
-| ---------------- | -------- |----------- |
+| Property | Type | Description |
+|------------------------| -------- |----------- |
 | `actorSpecification` | Required | We are at a version `1` which is the only one available so this must be set to `1`. |
 | `name` | Required | Name of the Actor. |
 | `version` | Required | Actor version in the form `[Number].[Number]`, i.e. for example `0.0`, `0.1`, `2.3`, ... |
@@ -72,6 +72,6 @@ import TabItem from '@theme/TabItem';
 | `dockerContextDir` | Optional | Specifies the path to the directory used as the Docker context when building the Actor. The path is relative to the location of the `actor.json` file. Useful for having a monorepo with multiple Actors. See [Actor monorepos](../deployment/source_types.md#actor-monorepos) for more details. |
 | `readme` | Optional | If you specify the path to your README file under the `readme` field, the README at this path will be used on the platform. If not specified, README at `.actor/README.md` or `README.md` will be used, in this order of preference. See our [Apify Academy article on writing a quality README files](/academy/get-most-of-actors/actor-readme). |
 | `input` | Optional | You can embed your [input schema](./input_schema/index.md) object directly in `actor.json` under the `input` field. Alternatively, you can provide a path to a custom input schema. If not provided, the input schema at `.actor/INPUT_SCHEMA.json` or `INPUT_SCHEMA.json` is used, in this order of preference. |
-`storages.dataset` | Optional | You can define the schema of the items in your dataset under the `storages.dataset` field. This can be either an embedded object or a path to a JSON schema file. [Read more](./output_schema.md#specification-version-1) about Actor output schemas. |
+| `storages.dataset` | Optional | You can define the schema of the items in your dataset under the `storages.dataset` field. This can be either an embedded object or a path to a JSON schema file. [Read more](./output_schema.md#specification-version-1) about Actor output schemas. |
 | `minMemoryMbytes` | Optional | Specifies the minimum amount of memory in megabytes that an Actor requires to run. Requires an integer value. If both `minMemoryMbytes` and `maxMemoryMbytes` are set, then `minMemoryMbytes` must be the same or lower than `maxMemoryMbytes`. |
 | `maxMemoryMbytes` | Optional | Specifies the maximum amount of memory in megabytes that an Actor requires to run. It can be used to control the costs of run, especially when developing pay per result actors. Requires an integer value. |
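For reference, a minimal sketch of the import pattern the lessons above switch to, assuming cheerio v1 (where `load` is a named export rather than a default export) and a locally installed Playwright; the demo URL is just the example store used in the lessons:

```js
// Minimal sketch: parse a headless browser's rendered HTML with Cheerio
// using the namespace import instead of the old default import.
import { chromium } from 'playwright';
import * as cheerio from 'cheerio';

const browser = await chromium.launch();
const page = await browser.newPage();
await page.goto('https://demo-webstore.apify.org/');

// page.content() returns the rendered HTML, which cheerio.load() can parse.
const $ = cheerio.load(await page.content());
console.log($('title').text());

await browser.close();
```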