Mikhail Zub for SerpApi

Posted on Aug 2, 2022

Web Scraping Google Maps Reviews with Nodejs

#webscraping #node #serpapi #maps

What will be scraped

Preparation

First, we need to create a Node.js* project and add npm packages puppeteer, puppeteer-extra and puppeteer-extra-plugin-stealth to control Chromium (or Chrome, or Firefox, but now we work only with Chromium which is used by default) over the DevTools Protocol in headless or non-headless mode.

To do this, in the directory with our project, open the command line and enter npm init -y, and then npm i puppeteer puppeteer-extra puppeteer-extra-plugin-stealth.

*If you don't have Node.js installed, you can download it from nodejs.org and follow the installation documentation.

📌Note: also, you can use puppeteer without any extensions, but I strongly recommended use it with puppeteer-extra with puppeteer-extra-plugin-stealth to prevent website detection that you are using headless Chromium or that you are using web driver. You can check it on Chrome headless tests website. The screenshot below shows you a difference.

Process

SelectorGadget Chrome extension was used to grab CSS selectors by clicking on the desired element in the browser. If you have any struggles understanding this, we have a dedicated Web Scraping with CSS Selectors blog post at SerpApi.

The Gif below illustrates the approach of selecting different parts of the results.

Full code

📌Note: to get a place URL you may use the tutorial from my blog post Web Scraping Google Maps Places with Nodejs.

const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");

puppeteer.use(StealthPlugin());

const placeUrl =
  "https://www.google.com/maps/place/Starbucks/data=!4m7!3m6!1s0x549069a98254bd17:0xb2f64f75b3edf4c3!8m2!3d47.5319688!4d-122.1942498!16s%2Fg%2F1tdfmzpb!19sChIJF71UgqlpkFQRw_Tts3VP9rI?authuser=0&hl=en&rclk=1";

async function scrollPage(page, scrollContainer) {
  let lastHeight = await page.evaluate(`document.querySelector("${scrollContainer}").scrollHeight`);
  while (true) {
    await page.evaluate(`document.querySelector("${scrollContainer}").scrollTo(0, document.querySelector("${scrollContainer}").scrollHeight)`);
    await page.waitForTimeout(2000);
    let newHeight = await page.evaluate(`document.querySelector("${scrollContainer}").scrollHeight`);
    if (newHeight === lastHeight) {
      break;
    }
    lastHeight = newHeight;
  }
}

async function getReviewsFromPage(page) {
  const reviews = await page.evaluate(() => {
    return Array.from(document.querySelectorAll(".jftiEf")).map((el) => {
      return {
        user: {
          name: el.querySelector(".d4r55")?.textContent.trim(),
          link: el.querySelector(".WNxzHc a")?.getAttribute("href"),
          thumbnail: el.querySelector(".NBa7we")?.getAttribute("src"),
          localGuide: el.querySelector(".RfnDt span:first-child")?.style.display === "none" ? undefined : true,
          reviews: parseInt(el.querySelector(".RfnDt span:last-child")?.textContent.replace("·", "")),
        },
        rating: parseFloat(el.querySelector(".kvMYJc")?.getAttribute("aria-label")),
        date: el.querySelector(".rsqaWe")?.textContent.trim(),
        snippet: el.querySelector(".MyEned")?.textContent.trim(),
        likes: parseFloat(el.querySelector(".GBkF3d:nth-child(2)")?.getAttribute("aria-label")),
        images: Array.from(el.querySelectorAll(".KtCyie button")).length
          ? Array.from(el.querySelectorAll(".KtCyie button")).map((el) => {
              return {
                thumbnail: getComputedStyle(el).backgroundImage.slice(5, -2),
              };
            })
          : undefined,
        date: el.querySelector(".rsqaWe")?.textContent.trim(),
      };
    });
  });
  return reviews;
}

async function fillPlaceInfo(page) {
  const placeInfo = await page.evaluate(() => {
    return {
      title: document.querySelector(".DUwDvf").textContent.trim(),
      address: document.querySelector("button[data-item-id='address']")?.textContent.trim(), // data-item-id attribute may be different if the language is not English
      rating: document.querySelector("div.F7nice").textContent.trim(),
      reviews: document.querySelector("span.F7nice").textContent.trim().split(" ")[0],
    };
  });
  return placeInfo;
}

async function getLocalPlaceReviews() {
  const browser = await puppeteer.launch({
    headless: false,
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });

  const page = await browser.newPage();

  await page.setDefaultNavigationTimeout(60000);
  await page.goto(placeUrl);
  await page.waitForSelector(".DUwDvf");

  const placeInfo = await fillPlaceInfo(page);

  await page.click(".mgr77e .DkEaL");
  await page.waitForTimeout(2000);
  await page.waitForSelector(".jftiEf");

  await scrollPage(page, '.DxyBCb');

  const reviews = await getReviewsFromPage(page);

  await browser.close();

  return { placeInfo, reviews };
}

getLocalPlaceReviews().then((result) => console.dir(result, { depth: null }));

Code explanation

Declare constants from required libraries:

const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");

Code	Explanation
`puppeteer`	Chromium control library
`StealthPlugin`	library for prevent website detection that you are using web driver

Next, we "say" to puppeteer use StealthPlugin and write place URL:

puppeteer.use(StealthPlugin());

const placeUrl =
  "https://www.google.com/maps/place/Starbucks/data=!4m7!3m6!1s0x549069a98254bd17:0xb2f64f75b3edf4c3!8m2!3d47.5319688!4d-122.1942498!16s%2Fg%2F1tdfmzpb!19sChIJF71UgqlpkFQRw_Tts3VP9rI?authuser=0&hl=en&rclk=1";

Next, we write down a function for scrolling reviews container on the page:

async function scrollPage(page, scrollContainer) {
  let lastHeight = await page.evaluate(`document.querySelector("${scrollContainer}").scrollHeight`);
  while (true) {
    await page.evaluate(`document.querySelector("${scrollContainer}").scrollTo(0, document.querySelector("${scrollContainer}").scrollHeight)`);
    await page.waitForTimeout(2000);
    let newHeight = await page.evaluate(`document.querySelector("${scrollContainer}").scrollHeight`);
    if (newHeight === lastHeight) {
      break;
    }
    lastHeight = newHeight;
  }
}

Code	Explanation
`lastHeight`	current scrollheight of the container
`page.evaluate('document.querySelector...`	runs code from the brackets in the browser console and returns the result
`page.waitForTimeout(2000)`	waiting 2000 ms before continue
`newHeight`	scrollheight of the container after scroll

Next, we write down a function for getting reviews from the page:

async function getReviewsFromPage(page) {
  const reviews = await page.evaluate(() => {
    return Array.from(document.querySelectorAll(".jftiEf")).map((el) => {
      return {
        user: {
          name: el.querySelector(".d4r55")?.textContent.trim(),
          link: el.querySelector(".WNxzHc a")?.getAttribute("href"),
          thumbnail: el.querySelector(".NBa7we")?.getAttribute("src"),
          localGuide: el.querySelector(".RfnDt span:first-child")?.style.display === "none" ? undefined : true,
          reviews: parseInt(el.querySelector(".RfnDt span:last-child")?.textContent.replace("·", "")),
        },
        rating: parseFloat(el.querySelector(".kvMYJc")?.getAttribute("aria-label")),
        date: el.querySelector(".rsqaWe")?.textContent.trim(),
        snippet: el.querySelector(".MyEned")?.textContent.trim(),
        likes: parseFloat(el.querySelector(".GBkF3d:nth-child(2)")?.getAttribute("aria-label")),
        images: Array.from(el.querySelectorAll(".KtCyie button")).length
          ? Array.from(el.querySelectorAll(".KtCyie button")).map((el) => {
              return {
                thumbnail: getComputedStyle(el).backgroundImage.slice(5, -2),
              };
            })
          : undefined,
        date: el.querySelector(".rsqaWe")?.textContent.trim(),
      };
    });
  });
  return reviews;
}

Code	Explanation
`document.querySelectorAll(".jftiEf")`	returns a static NodeList representing a list of the document's elements that match the css selectors with class name `jftiEf`
`el.querySelector(".d4r55")`	returns the first html element with selector `.d4r55` which is any child of the `el` html element
`.textContent`	gets the raw text of html element
`.trim()`	removes whitespace from both ends of a string
`.getAttribute("href")`	gets the `href` attribute value of the html element
`getComputedStyle(el).backgroundImage`	`getComputedStyle(el)` returns an object containing the values of all CSS properties of an `el`, after applying active stylesheets, and get `backgroundImage` property
`.slice(5, -2)`	this method keeps everything from the 5th character from the beginning to the 2nd (inclusive) character from the end and removes the others

Next, we write down a function for getting main place info from the page:

async function fillPlaceInfo(page) {
  const placeInfo = await page.evaluate(() => {
    return {
      title: document.querySelector(".DUwDvf").textContent.trim(),
      address: document.querySelector("button[data-item-id='address']")?.textContent.trim(),
      rating: document.querySelector("div.F7nice").textContent.trim(),
      reviews: document.querySelector("span.F7nice").textContent.trim().split(" ")[0],
    };
  });
  return placeInfo;
}

And finally, a function to control the browser, and get information:

async function getLocalPlaceReviews() {
  const browser = await puppeteer.launch({
    headless: false,
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });

  const page = await browser.newPage();

  await page.setDefaultNavigationTimeout(60000);
  await page.goto(placeUrl);
  await page.waitForSelector(".DUwDvf");

  const placeInfo = await fillPlaceInfo(page);

  await page.click(".mgr77e .DkEaL");
  await page.waitForTimeout(2000);
  await page.waitForSelector(".jftiEf");

  await scrollPage(page, '.DxyBCb');

  const reviews = await getReviewsFromPage(page);

  await browser.close();

  return { placeInfo, reviews };
}

getLocalPlaceReviews().then((result) => console.dir(result, { depth: null }));

Code	Explanation
`puppeteer.launch({options})`	this method launches a new instance of the Chromium browser with current `options`
`headless`	defines which mode to use: headless (by default) or non-headless
`args`	an array with arguments which is used with Chromium
`["--no-sandbox", "--disable-setuid-sandbox"]`	these arguments we use to allow the launch of the browser process in the online IDE
`browser.newPage()`	this method launches a new page
`page.setDefaultNavigationTimeout(60000)`	changing default (30 sec) time for waiting for selectors to 60000 ms (1 min) for slow internet connection
`page.goto(URL)`	navigation to `URL` which is defined above
`page.click(".Dx2nRe")`	this methods emulates mouse click on the html element with the `.Dx2nRe` selector
`browser.close()`	after all we close the browser instance
`console.dir(result, { depth: null })`	console method `dir` allows you to use an object with the necessary parameters to change default output options. Watch Node.js documentation for more info

Now we can launch our parser. To do this enter node YOUR_FILE_NAME in your command line. Where YOUR_FILE_NAME is the name of your .js file.

Output

{
   "placeInfo":{
      "title":"Starbucks",
      "address":"1785 NE 44th St, Renton, WA 98056, United States",
      "rating":"4.1",
      "reviews":"381"
   },
   "reviews":[
      {
         "user":{
            "name":"Bo Wagner",
            "link":"https://www.google.com/maps/contrib/118325097789436047813/reviews?hl=en-US",
            "thumbnail":"https://lh3.googleusercontent.com/a/AItbvmlPWzfGuqAk1v2yewzIizLcl462BenzGnCadQWt=w36-h36-p-c0x00000000-rp-mo-ba6-br100",
            "localGuide":true,
            "reviews":442
         },
         "rating":4,
         "date":"5 months ago",
         "snippet":"Good service, but waiting a bit long for my drink.  Look like a trainee was making my drink. It taste different.",
         "likes":1,
         "images":[
            {
               "thumbnail":"https://lh5.googleusercontent.com/p/AF1QipNIUP-aOWRElmfVOjnf5lJJYFiLKBaSx7MSkhg8=w300-h450-p-k-no"
            },
            {
               "thumbnail":"https://lh5.googleusercontent.com/p/AF1QipPcTFJIW9JAZxZ0PU0WC2U5rPnESv7OnrnSANwV=w300-h225-p-k-no"
            },
            {
               "thumbnail":"https://lh5.googleusercontent.com/p/AF1QipN_LkT7MCwx-oaf1yXkMnc_D-gm6HrWa7Kqoep8=w300-h225-p-k-no"
            }
         ]
      },
      {
         "user":{
            "name":"Azurina S (Zeze)",
            "link":"https://www.google.com/maps/contrib/108701024889578509779/reviews?hl=en-US",
            "thumbnail":"https://lh3.googleusercontent.com/a-/AFdZucqQsjYaAOuvBT8dMBe_BeywrjLtshpgCL3xZGp5mg=w36-h36-p-c0x00000000-rp-mo-br100",
            "reviews":7
         },
         "rating":5,
         "date":"4 months ago",
         "snippet":"Super friendly and fast.  They were getting through that Drive-Thru line at record speed!! Thank you for that because I was in a serious rush!! 👍🏽",
         "likes":1,
         "images":[
            {
               "thumbnail":"https://lh5.googleusercontent.com/p/AF1QipPrI2xvgjFNh2vxFmBxRJBYvw553mORZdRZYwdZ=w300-h450-p-k-no"
            },
            {
               "thumbnail":"https://lh5.googleusercontent.com/p/AF1QipPVZ4YJqXjLvL-XTFBpB0oo4lVaBdrAGv2Ohyux=w300-h450-p-k-no"
            }
         ]
      },
      ...and other reviews
   ]
}

Google Maps Reviews API

Alternatively, you can use the Google Maps Reviews API from SerpApi. SerpApi is a free API with 100 searches per month. If you need more searches, there are paid plans.

The difference is that you won't have to write code from scratch and maintain it. You may also experience blocking from Google and changing selectors which will break the parser. Instead, you just need to iterate the structured JSON and get the data you want. Check out the playground.

First, we need to install google-search-results-nodejs. To do this you need to enter in your console: npm i google-search-results-nodejs

📌Note: To make our search we need the data_id parameter. You can take it using the guide from my blog post Web Scraping Google Maps Places with Nodejs.

const SerpApi = require("google-search-results-nodejs");
const search = new SerpApi.GoogleSearch(process.env.API_KEY);     //your API key from serpapi.com

const dataId = "0x549069a98254bd17:0xb2f64f75b3edf4c3";                    // data ID parameter

const params = {
  engine: "google_maps_reviews",                                           // search engine
  hl: "en",                                                                // parameter defines the language to use for the Google search
  data_id: dataId,                                                         // parameter defines the Google Maps data ID
};

const getJson = () => {
  return new Promise((resolve) => {
    search.json(params, resolve);
  });
};

exports.getResults = async () => {
  const allReviews = {
    reviews: [],
  };
  while (true) {
    const json = await getJson();
    if (!allReviews.placeInfo) allReviews.placeInfo = json.place_info;
    if (json.reviews) {
      allReviews.reviews.push(...json.reviews);
    } else break;
    if (json.serpapi_pagination?.next_page_token) {
      params.next_page_token = json.serpapi_pagination?.next_page_token;
    } else break;
  }
  return allReviews;
};

getResults.then((result) => console.dir(result, { depth: null }));

Code explanation

Declare constants from required libraries:

const SerpApi = require("google-search-results-nodejs");
const search = new SerpApi.GoogleSearch(API_KEY);

Code	Explanation
`SerpApi`	SerpApi Node.js library
`search`	new instance of GoogleSearch class
`API_KEY`	your API key from SerpApi

Next, we write down what we want to search and the necessary parameters for making a request:

const dataId = "0x549069a98254bd17:0xb2f64f75b3edf4c3";

const params = {
  engine: "google_maps_reviews",
  hl: "en",
  data_id: dataId,
};

Code	Explanation
`dataId`	data ID parameter
`engine`	search engine
`hl`	parameter defines the language to use for the Google Scholar search

Next, we wrap the search method from the SerpApi library in a promise to further work with the search results:

const getJson = () => {
  return new Promise((resolve) => {
    search.json(params, resolve);
  })
}

And finally, we declare and run the function getResult that gets reviews from all pages and return it:

const getResults = async () => {
  const allReviews = {
    reviews: [],
  };
  while (true) {
    const json = await getJson();
    if (!allReviews.placeInfo) allReviews.placeInfo = json.place_info;
    if (json.reviews) {
      allReviews.reviews.push(...json.reviews);
    } else break;
    if (json.serpapi_pagination?.next_page_token) {
      params.next_page_token = json.serpapi_pagination?.next_page_token;
    } else break;
  }
  return allReviews;
};

getResults().then((result) => console.dir(result, { depth: null }))

Code	Explanation
`allReviews`	an object with main place info and reviews from all pages
`allReviews.reviews.push(...json.reviews)`	in this code, we use spread syntax to split the `photos` array from result that was returned from `reviews` function into elements and add them in the end of `allReviews.reviews` array
`console.dir(result, { depth: null })`	console method `dir` allows you to use an object with the necessary parameters to change default output options. Watch Node.js documentation for more info

Output

{
   "reviews":[
      {
         "user":{
            "name":"Bo Wagner",
            "link":"https://www.google.com/maps/contrib/118325097789436047813?hl=en-US&sa=X&ved=2ahUKEwiEpJXYzoz5AhXDVDUKHbpYCAwQvvQBegQIARBB",
            "thumbnail":"https://lh3.googleusercontent.com/a/AItbvmlPWzfGuqAk1v2yewzIizLcl462BenzGnCadQWt=s40-c-c0x00000000-cc-rp-mo-ba6-br100",
            "local_guide":true,
            "reviews":442,
            "photos":4747
         },
         "rating":4,
         "date":"5 months ago",
         "snippet":"Good service, but waiting a bit long for my drink. Look like a trainee was making my drink. It taste different.",
         "likes":1,
         "images":[
            "https://lh5.googleusercontent.com/p/AF1QipNIUP-aOWRElmfVOjnf5lJJYFiLKBaSx7MSkhg8=w100-h100-p-n-k-no",
            "https://lh5.googleusercontent.com/p/AF1QipPcTFJIW9JAZxZ0PU0WC2U5rPnESv7OnrnSANwV=w100-h100-p-n-k-no",
            "https://lh5.googleusercontent.com/p/AF1QipN_LkT7MCwx-oaf1yXkMnc_D-gm6HrWa7Kqoep8=w100-h100-p-n-k-no"
         ]
      },
      {
         "user":{
            "name":"Azurina S (Zeze)",
            "link":"https://www.google.com/maps/contrib/108701024889578509779?hl=en-US&sa=X&ved=2ahUKEwiEpJXYzoz5AhXDVDUKHbpYCAwQvvQBegQIARBb",
            "thumbnail":"https://lh3.googleusercontent.com/a-/AFdZucqQsjYaAOuvBT8dMBe_BeywrjLtshpgCL3xZGp5mg=s40-c-c0x00000000-cc-rp-mo-br100",
            "reviews":7,
            "photos":2
         },
         "rating":5,
         "date":"4 months ago",
         "snippet":"Super friendly and fast. They were getting through that Drive-Thru line at record speed!! Thank you for that because I was in a serious rush!! 👍🏽",
         "likes":1,
         "images":[
            "https://lh5.googleusercontent.com/p/AF1QipPrI2xvgjFNh2vxFmBxRJBYvw553mORZdRZYwdZ=w100-h100-p-n-k-no",
            "https://lh5.googleusercontent.com/p/AF1QipPVZ4YJqXjLvL-XTFBpB0oo4lVaBdrAGv2Ohyux=w100-h100-p-n-k-no"
         ]
      },
      ...and other reviews
   ],
   "placeInfo":{
      "title":"Starbucks",
      "address":"1785 NE 44th St, Renton, WA",
      "rating":4.1,
      "reviews":381
   }
}

Links

If you want to see some projects made with SerpApi, please write me a message.

Join us on Twitter | YouTube

Add a Feature Request💫 or a Bug🐞

DEV Community

Web Scraping Google Maps Reviews with Nodejs

What will be scraped

Preparation

Process

Full code

Code explanation

Output

Google Maps Reviews API

Code explanation

Output

Links

Top comments (0)

Read next

The Right Way to do Authentication in Node.js [November 2024]

Ensuring Data Integrity in Financial Transactions: The PostgreSQL Transaction Solution

🛠️ Battle of the Backend: Go vs Node.js vs Python – Which One Reigns Supreme in 2024? 🚀

npm vs npx: Choosing the Right Tool for the Job