What will be scraped
Preparation
First, we need to create a Node.js* project and add npm
packages puppeteer
, puppeteer-extra
and puppeteer-extra-plugin-stealth
to control Chromium (or Chrome, or Firefox, but now we work only with Chromium which is used by default) over the DevTools Protocol in headless or non-headless mode.
To do this, in the directory with our project, open the command line and enter npm init -y
, and then npm i puppeteer puppeteer-extra puppeteer-extra-plugin-stealth
.
*If you don't have Node.js installed, you can download it from nodejs.org and follow the installation documentation.
📌Note: also, you can use puppeteer
without any extensions, but I strongly recommended use it with puppeteer-extra
with puppeteer-extra-plugin-stealth
to prevent website detection that you are using headless Chromium or that you are using web driver. You can check it on Chrome headless tests website. The screenshot below shows you a difference.
Process
SelectorGadget Chrome extension was used to grab CSS selectors by clicking on the desired element in the browser. If you have any struggles understanding this, we have a dedicated Web Scraping with CSS Selectors blog post at SerpApi.
The Gif below illustrates the approach of selecting different parts of the results.
Full code
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
puppeteer.use(StealthPlugin());
const searchString = "playstation"; // what we want to search
const pagesLimit = 10; // limit of pages for getting info
let currentPage = 1; // current page of the search
const URL = "https://www.ebay.com";
async function getPageResults(page) {
const pageResults = await page.evaluate(function () {
return Array.from(document.querySelectorAll("ul .s-item__wrapper")).map((el) => ({
link: el.querySelector(".s-item__link").getAttribute("href"),
title: el.querySelector(".s-item__title").textContent.trim(),
condition: el.querySelector(".SECONDARY_INFO")?.textContent.trim() || "No condition data",
price: el.querySelector(".s-item__price")?.textContent.trim() || "No price data",
shipping: el.querySelector(".s-item__shipping")?.textContent.trim() || "No shipping data",
thumbnail: el.querySelector(".s-item__image-img")?.getAttribute("src") || "No image",
}));
});
return pageResults;
}
async function getOrganicResults() {
const browser = await puppeteer.launch({
headless: false,
args: ["--no-sandbox", "--disable-setuid-sandbox"],
});
const page = await browser.newPage();
await page.setDefaultNavigationTimeout(60000);
await page.goto(URL);
await page.waitForSelector("#gh-ac");
await page.focus("#gh-ac");
await page.keyboard.type(searchString);
await page.waitForTimeout(1000);
await page.click("#gh-btn");
const organicResults = [];
while (true) {
await page.waitForSelector(".srp-results");
const isNextPage = await page.$(".pagination__next");
if (!isNextPage || currentPage > pagesLimit) break;
organicResults.push(...(await getPageResults(page)));
await page.click(".pagination__next");
currentPage++;
}
await browser.close();
return organicResults;
}
getOrganicResults().then(console.log);
Code explanation
Declare constants from required libraries:
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
Code | Explanation |
---|---|
puppeteer |
Chromium control library |
StealthPlugin |
library for prevent website detection that you are using web driver |
Next, we "saying" to puppeteer
use StealthPlugin
:
puppeteer.use(StealthPlugin());
Next, we write variables with our search parameters:
const searchString = "playstation";
const pagesLimit = 10;
let currentPage = 1;
const URL = "https://www.ebay.com";
Code | Explanation |
---|---|
searchString |
what we want to search |
pagesLimit |
limit of pages for getting info |
currentPage |
current page of the search |
Next, we write down a function for getting information from page:
async function getPageResults(page) {
const pageResults = await page.evaluate(function () {
return Array.from(document.querySelectorAll("ul .s-item__wrapper")).map((el) => ({
link: el.querySelector(".s-item__link").getAttribute("href"),
title: el.querySelector(".s-item__title").textContent.trim(),
condition: el.querySelector(".SECONDARY_INFO")?.textContent.trim() || "No condition data",
price: el.querySelector(".s-item__price")?.textContent.trim() || "No price data",
shipping: el.querySelector(".s-item__shipping")?.textContent.trim() || "No shipping data",
thumbnail: el.querySelector(".s-item__image-img")?.getAttribute("src") || "No image",
}));
});
return pageResults;
}
Code | Explanation |
---|---|
pageResults |
an array with information about all goods from page |
page.evaluate(function () { |
is the Puppeteer method for injecting function in the page context and allows to return data directly from the browser |
document.querySelectorAll("ul .s-item__wrapper") |
returns a static NodeList representing a list of the document's elements that match the css selectors with class name s-item__wrapper which is any child of ul html element |
el.querySelector(".s-item__link") |
returns the first html element with class name s-item__link which is any child of the el html element |
.getAttribute("href") |
gets the href attribute value of the html element |
.trim() |
removes whitespace from both ends of a string |
And finally, a function to control the browser, change pages, and runs getPageResults
from each page:
async function getOrganicResults() {
const browser = await puppeteer.launch({
headless: false,
args: ["--no-sandbox", "--disable-setuid-sandbox"],
});
const page = await browser.newPage();
await page.setDefaultNavigationTimeout(60000);
await page.goto(URL);
await page.waitForSelector("#gh-ac");
await page.focus("#gh-ac");
await page.keyboard.type(searchString);
await page.waitForTimeout(1000);
await page.click("#gh-btn");
const organicResults = [];
while (true) {
await page.waitForSelector(".srp-results");
const isNextPage = await page.$(".pagination__next");
if (!isNextPage || currentPage > pagesLimit) break;
organicResults.push(...(await getPageResults(page)));
await page.click(".pagination__next");
currentPage++;
}
await browser.close();
return organicResults;
}
Code | Explanation |
---|---|
puppeteer.launch({options}) |
this method launches a new instance of the Chromium browser with current options
|
headless |
defines which mode to use: headless (by default) or non-headless |
args |
an array with arguments which is used with Chromium |
["--no-sandbox", "--disable-setuid-sandbox"] |
these arguments we use to allow the launch of the browser process in the online IDE |
browser.newPage() |
this method launches a new page |
page.setDefaultNavigationTimeout(60000) |
changing default (30 sec) time for waiting for selectors to 60000 ms (1 min) for slow internet connection |
page.goto(URL) |
navigation to URL which is defined above |
page.waitForSelector("#gh-ac") |
stops the script and waits for the html element with the #gh-ac selector to load |
page.focus("#gh-ac") |
focus on the html element with the #gh-ac selector |
page.keyboard.type(searchString) |
this method emulates keyboard input stored in searchString text |
page.waitForTimeout(1000) |
waiting 1000 ms before continue |
page.click("#gh-btn") |
this methods emulates mouse click on the html element with the #gh-btn selector |
const isNextPage = await page.$(".pagination__next") |
in this line of code, we find the html element with the .pagination__next selector and save it in isNextPage constant |
if (!isNextPage ┃┃ currentPage > pagesLimit) break |
in this line of code, we check that isNextPage is not equal to true (! character) and currentPage is greater than pagesLimit . And if the expression in brackets is true we run break which ends the while loop |
organicResults.push(...(await getPageResults(page))) |
in this code, we use spread syntax to split the array from result that was returned from getPageResults function into elements and add them in the end of organicResults array |
browser.close() |
after all we close the browser instance |
📌Note: in this case, we can skip
await page.waitForSelector("#gh-ac");
await page.focus("#gh-ac");
await page.keyboard.type(searchString);
await page.waitForTimeout(1000);
await page.click("#gh-btn");
and we can add search query directly in URL so there's no need to open the main page and then make the extra request to display product results. I write it because I wanted to show some helpful methods of puppeteer
. But you can remove this code, change the URL
constant to https://www.ebay.com/sch/i.html?_nkw=${searchString}
and write searchString
constant in URI encoding like const searchString = encodeURI("playstation");
.
Now we can launch our parser. To do this enter node YOUR_FILE_NAME
in your command line. Where YOUR_FILE_NAME
is the name of your .js
file.
Output
[
{
"link":"https://www.ebay.com/itm/324843125086?epid=4039276585&hash=item4ba228b95e:g:iygAAOSwJfFdty6D&amdata=enc%3AAQAHAAAA4BFods1e0MuxITheGVrICxhRAyEmOAtx6%2BT28euOBHuShbOsuqcQpMnhGTsxgz2hVdsgoUlbIYGe5nghV6CFsQVPDoRG%2FKhoBe2ilQMTPM%2BmcyGm8Qx%2B2DL%2BOg3UZAGCbXM0jikrzbg0zKp1PCYgcINmwxFqy7MaNP%2BnO2TMJEIK45mGxj6Ymsx2lDyuT84SMvBClalDYs9rJMZmrzQqIgyo0Kerk6Wk6F1l%2BBDyJ%2Blpe%2BmwoYqzu2FzQxAX3gAyRF0XSTyrDRu2IYXb0Kh89kbvtuq0KNyNG%2B2lZdO78M0R%7Ctkp%3ABFBMjMe8n6Vg",
"title":"Dusk Diver Day One Edition PlayStation 4, PS4 Brand New Factory Sealed",
"condition":"Brand New",
"price":"$37.49",
"shipping":"Free shipping",
"thumbnail":"https://i.ebayimg.com/thumbs/images/g/iygAAOSwJfFdty6D/s-l225.webp"
},
{
"link":"https://www.ebay.com/itm/265719188920?epid=110660824&hash=item3dde190db8:g:-oEAAOSwztpimIB~",
"title":"Sony PlayStation with xStation installed (PS1, NTSC SCPH-5501)",
"condition":"Pre-Owned",
"price":"$289.99",
"shipping":"+$13.15 shipping",
"thumbnail":"https://i.ebayimg.com/thumbs/images/g/-oEAAOSwztpimIB~/s-l225.webp"
},
...and other results
]
Ebay Search Engine Results API
Alternatively, you can use the Ebay Search Engine Results API from SerpApi. SerpApi is a free API with 100 searches per month. If you need more searches, there are paid plans.
The difference is that you don't have to use browser control solutions (like Puppeteer) which is much more time-consuming, you also don't have to look for the right selectors that can change over time, bypass blocking from robots and maintain a solution written from scratch. Instead, you will get ready-made structured JSON. Check out the playground.
First we need to install google-search-results-nodejs
. To do this you need to enter in your command line: npm i google-search-results-nodejs
const SerpApi = require("google-search-results-nodejs");
const search = new SerpApi.GoogleSearch(process.env.API_KEY); //your api key from serpapi.com
const searchString = "playstation"; // what we want to search
const pagesLimit = 10; // limit of pages for getting info
let currentPage = 1; // current page of the search
const params = {
engine: "ebay", // search engine
_nkw: searchString, // search query
ebay_domain: "ebay.com", // ebay domain of the search
_pgn: currentPage, // page of the search
};
const getOrganicResults = ({ organic_results }) => {
return organic_results.map((element) => {
const { link, title, condition = "No condition data", price = "No price data", shipping = "No shipping data", thumbnail = "No image" } = element;
return {
link,
title,
condition,
price: price && price.raw ? price.raw : `${price.from?.raw} - ${price.to?.raw}`,
shipping,
thumbnail,
};
});
};
const getJson = (params) => {
return new Promise((resolve) => {
search.json(params, resolve);
});
};
const getResults = async () => {
const organicResults = [];
while (true) {
if (currentPage > pagesLimit) break;
const json = await getJson(params);
if (json.search_information?.organic_results_state === "Fully empty") break;
organicResults.push(...(await getOrganicResults(json)));
currentPage++;
}
return organicResults;
};
getResults().then(console.log)
Code explanation
Declare constants from required libraries:
const SerpApi = require("google-search-results-nodejs");
const search = new SerpApi.GoogleSearch(API_KEY);
Code | Explanation |
---|---|
SerpApi |
SerpApi Node.js library |
search |
new instance of GoogleSearch class |
API_KEY |
your API key from SerpApi |
Next, we write down what we want to search and the necessary parameters for making a request:
const searchString = "playstation";
const pagesLimit = 10;
let currentPage = 1;
const params = {
engine: "ebay",
_nkw: searchString,
ebay_domain: "ebay.com",
_pgn: currentPage,
};
Code | Explanation |
---|---|
searchString |
what we want to search |
pagesLimit |
limit of pages for getting info |
currentPage |
current page of the search |
engine |
search engine |
_nkw |
search query |
ebay_domain |
ebay domain: ebay.com, ebay.de, ebay.co.uk |
_pgn |
current page |
Next, we write a callback function in which we describe what data we need from the result of our request:
const getOrganicResults = ({ organic_results }) => {
return organic_results.map((element) => {
const { link, title, condition = "No condition data", price = "No price data", shipping = "No shipping data", thumbnail = "No image" } = element;
return {
link,
title,
condition,
price: price && price.raw ? price.raw : `${price.from?.raw} - ${price.to?.raw}`,
shipping,
thumbnail,
};
});
};
Code | Explanation |
---|---|
organic_results |
an array that we destructured from response |
link, title, condition, price, shipping, thumbnail |
other data that we destructured from element of news_results array |
thumbnail = "No image" |
we set default value No image if thumbnail is undefined
|
price: price && price.raw ? price.raw : '${price.from?.raw} - ${price.to?.raw}' |
in this line we use ternary operator to set vailid price. If we can get price and data with raw key we set it to our price , otherwise in price we set price.from and price.to
|
Next, we wrap the search method from the SerpApi library in a promise to further work with the search results:
const getJson = (params) => {
return new Promise((resolve) => {
search.json(params, resolve);
})
}
getJson(params).then(getKnowledgeGraph).then(console.log)
And finally, we declare and run the function getResult
that gets info from all pages between currentPage
and pagesLimit
and return it.
const getResults = async () => {
const organicResults = [];
while (true) {
if (currentPage > pagesLimit) break;
const json = await getJson(params);
if (json.search_information?.organic_results_state === "Fully empty") break;
organicResults.push(...(await getOrganicResults(json)));
currentPage++;
}
return organicResults;
};
getResults().then(console.log)
Code | Explanation |
---|---|
organicResults.push(...(await getOrganicResults(json))) |
in this code, we use spread syntax to split the array from result that was returned from getOrganicResults function into elements and add them in the end of organicResults array |
Output
[
{
"link":"https://www.ebay.com/itm/324950767168?hash=item4ba8933640:g:yQsAAOSwU8phwB9l",
"title":"Sony PlayStation PS Vita OLED (PCH-1001) Firmware FW 3.60, 128GB - Ship in 1-DAY",
"condition":"Open Box",
"price":"$179.95",
"shipping":"Free shipping",
"thumbnail":"https://i.ebayimg.com/thumbs/images/g/yQsAAOSwU8phwB9l/s-l225.jpg"
},
{
"link":"https://www.ebay.com/itm/393419045168?hash=item5b999a3930:g:NzYAAOSwBPNiBoAk",
"title":"PS4 PlayStation 4 Sony Original Slim Pro 500GB 1TB 2TB Console Used Ship first",
"condition":"Pre-Owned",
"price":"$259.80 - $484.99",
"shipping":"Free shipping",
"thumbnail":"https://i.ebayimg.com/thumbs/images/g/NzYAAOSwBPNiBoAk/s-l225.jpg"
},
...and other results
]
Links
- Code in the online IDE
- Ebay Search Engine Results API
- Ebay Search Engine Results in SerpApi Playgrond
If you want to see some projects made with SerpApi, please write me a message.
Add a Feature Request💫 or a Bug🐞
Top comments (1)
This is very well written, thanks for it