Web scraping with Javascript rendering
Getting started
Simply send a GET request to https://chrome.browsercloud.io/content with the two required query string parameters (token and url) and the API will return the HTML response:
- Curl
- Python
- NodeJS
- PHP
- Go
- Java
curl 'https://chrome.browsercloud.io/content?token=API_TOKEN&url=https://site.com'
import requests
# Query-string parameters for the /content endpoint; both are required.
params = {
    'token': 'API_TOKEN',
    'url': 'https://browsercloud.io/doc-examples/content.html',
}
# requests URL-encodes the dict and appends it to the URL as the query string.
res = requests.get('https://chrome.browsercloud.io/content', params=params)
# A Response object is truthy only when the HTTP status is not an error (res.ok).
if res:
    print('Response OK')
    print(res.text)
else:
    print('Response Failed')
//npm i axios
const axios = require('axios');
/**
 * Sends a GET request to the /content endpoint and logs the response body.
 * Request failures are caught and logged instead of being rethrown.
 *
 * @param {Object} params - query-string parameters sent with the request
 */
async function doRequest(params) {
  try {
    const { data } = await axios.get('https://chrome.browsercloud.io/content', { params });
    console.log(data);
  } catch (err) {
    console.log(err);
  }
}

doRequest({
  token: 'API_TOKEN', // required
  url: 'https://browsercloud.io/doc-examples/content.html', // required
});
<?php
echo doRequest([
    'token' => 'API_TOKEN',
    'url' => 'https://browsercloud.io/doc-examples/content.html'
]);

/**
 * Sends a GET request to the /content endpoint and returns the response body.
 *
 * @param array $params Query-string parameters (token and url are required).
 * @return string|false Response body, or false when the cURL transfer failed
 *                      (the error is echoed before returning).
 */
function doRequest($params) {
    $url = 'https://chrome.browsercloud.io/content?'. http_build_query($params);
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_URL, $url);
    $response = curl_exec($ch);
    // Compare strictly: curl_exec() returns false on failure, while an empty
    // or "0" body is a valid response that a loose check would misreport.
    if ($response === false) {
        echo "Error: ". curl_error($ch). "; Code: ". curl_errno($ch);
    }
    // Release the cURL handle once the error details have been read.
    curl_close($ch);
    return $response;
}
package main
import (
"fmt"
"io/ioutil"
"net/http"
"net/url"
)
// main requests the example page and prints the response body,
// or prints the error when the request failed.
func main() {
	res, err := doRequest(url.Values{
		"token": {"API_TOKEN"}, // required
		"url":   {"https://browsercloud.io/doc-examples/content.html"}, // required
	})
	if err != nil {
		fmt.Printf("Error: %v", err)
		// Stop here: doRequest returns a nil body alongside an error,
		// so there is nothing meaningful left to print.
		return
	}
	fmt.Printf("response: %s", res)
}
// doRequest sends a GET request to the /content endpoint with params encoded
// as the query string and returns the raw response body.
// It returns a nil body and the error if building the request, performing it,
// or reading the body fails.
func doRequest(params url.Values) ([]byte, error) {
	req, err := http.NewRequest("GET", "https://chrome.browsercloud.io/content", nil)
	if err != nil {
		// Without this check a nil req would be dereferenced on the next line.
		return nil, err
	}
	req.URL.RawQuery = params.Encode()
	res, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, err
	}
	// The caller of Do is responsible for closing the body; leaving it open
	// leaks the underlying connection.
	defer res.Body.Close()
	body, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return nil, err
	}
	return body, nil
}
import java.net.URI;
import java.net.http.*;
/**
 * Minimal client for the /content endpoint: issues one GET request and
 * prints the response body.
 */
public class Main {
    public static void main(String[] args) {
        sendRequest(
            "https://chrome.browsercloud.io/content?"+
            "token=API_TOKEN"+
            "&url=https://browsercloud.io/doc-examples/content.html"
        );
    }

    /**
     * Sends a GET request to {@code url} and prints the response body to stdout.
     * Any exception raised along the way is printed instead of propagated.
     */
    private static void sendRequest(String url) {
        try {
            HttpClient httpClient = HttpClient.newHttpClient();
            HttpRequest getRequest = HttpRequest.newBuilder(URI.create(url)).build();
            HttpResponse<String> reply =
                httpClient.send(getRequest, HttpResponse.BodyHandlers.ofString());
            System.out.println(reply.body());
        } catch (Exception failure) {
            System.out.println(failure);
        }
    }
}
Parameter | Available values | Description |
---|---|---|
url | string, required. Example : https://browsercloud.io | site URL you would like to scrape |
token | string, required | your API token |
Javascript rendering
Javascript rendering is enabled by default. You can use the static web crawler with JS rendering disabled by setting the render
parameter to false.
- Curl
- Python
- NodeJS
- PHP
- Go
- Java
curl 'https://chrome.browsercloud.io/content?token=API_TOKEN&url=https://site.com&render=false'
import requests
# Query-string parameters for the /content endpoint; token and url are
# required, render toggles JS rendering.
params = {
    'token': 'API_TOKEN',
    'url': 'https://browsercloud.io/doc-examples/content.html',
    'render': 'true'
}
# requests URL-encodes the dict and appends it to the URL as the query string.
res = requests.get('https://chrome.browsercloud.io/content', params=params)
# A Response object is truthy only when the HTTP status is not an error (res.ok).
if res:
    print('Response OK')
    print(res.text)
else:
    print('Response Failed')
//npm i axios
const axios = require('axios');
/**
 * Sends a GET request to the /content endpoint and logs the response body.
 * Request failures are caught and logged instead of being rethrown.
 *
 * @param {Object} params - query-string parameters sent with the request
 */
async function doRequest(params) {
  try {
    const { data } = await axios.get('https://chrome.browsercloud.io/content', { params });
    console.log(data);
  } catch (err) {
    console.log(err);
  }
}

doRequest({
  token: 'API_TOKEN', // required
  url: 'https://browsercloud.io/doc-examples/content.html', // required
  render: true,
});
<?php
echo doRequest([
    'token' => 'API_TOKEN',
    'url' => 'https://browsercloud.io/doc-examples/content.html',
    'render' => 'true',
]);

/**
 * Sends a GET request to the /content endpoint and returns the response body.
 *
 * @param array $params Query-string parameters (token and url are required).
 * @return string|false Response body, or false when the cURL transfer failed
 *                      (the error is echoed before returning).
 */
function doRequest($params) {
    $url = 'https://chrome.browsercloud.io/content?'. http_build_query($params);
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_URL, $url);
    $response = curl_exec($ch);
    // Compare strictly: curl_exec() returns false on failure, while an empty
    // or "0" body is a valid response that a loose check would misreport.
    if ($response === false) {
        echo "Error: ". curl_error($ch). "; Code: ". curl_errno($ch);
    }
    // Release the cURL handle once the error details have been read.
    curl_close($ch);
    return $response;
}
package main
import (
"fmt"
"io/ioutil"
"net/http"
"net/url"
)
// main requests the example page with the render parameter set and prints
// the response body, or prints the error when the request failed.
func main() {
	res, err := doRequest(url.Values{
		"token":  {"API_TOKEN"}, // required
		"url":    {"https://browsercloud.io/doc-examples/content.html"}, // required
		"render": {"true"},
	})
	if err != nil {
		fmt.Printf("Error: %v", err)
		// Stop here: doRequest returns a nil body alongside an error,
		// so there is nothing meaningful left to print.
		return
	}
	fmt.Printf("response: %s", res)
}
// doRequest sends a GET request to the /content endpoint with params encoded
// as the query string and returns the raw response body.
// It returns a nil body and the error if building the request, performing it,
// or reading the body fails.
func doRequest(params url.Values) ([]byte, error) {
	req, err := http.NewRequest("GET", "https://chrome.browsercloud.io/content", nil)
	if err != nil {
		// Without this check a nil req would be dereferenced on the next line.
		return nil, err
	}
	req.URL.RawQuery = params.Encode()
	res, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, err
	}
	// The caller of Do is responsible for closing the body; leaving it open
	// leaks the underlying connection.
	defer res.Body.Close()
	body, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return nil, err
	}
	return body, nil
}
import java.net.URI;
import java.net.http.*;
/**
 * Minimal client for the /content endpoint: issues one GET request that
 * includes the render parameter and prints the response body.
 */
public class Main {
    public static void main(String[] args) {
        sendRequest(
            "https://chrome.browsercloud.io/content?"+
            "token=API_TOKEN"+
            "&url=https://browsercloud.io/doc-examples/content.html"+
            "&render=true"
        );
    }

    /**
     * Sends a GET request to {@code url} and prints the response body to stdout.
     * Any exception raised along the way is printed instead of propagated.
     */
    private static void sendRequest(String url) {
        try {
            HttpClient httpClient = HttpClient.newHttpClient();
            HttpRequest getRequest = HttpRequest.newBuilder(URI.create(url)).build();
            HttpResponse<String> reply =
                httpClient.send(getRequest, HttpResponse.BodyHandlers.ofString());
            System.out.println(reply.body());
        } catch (Exception failure) {
            System.out.println(failure);
        }
    }
}
Parameter | Available values | Description |
---|---|---|
render | false / true (enabled by default, you can omit this param) | Disable / Enable JS rendering by real browser |
Device
Our web crawler sends a unique User-Agent string with each request to avoid getting blocked. We use more than 1000 of the most popular desktop and mobile devices.
curl 'https://chrome.browsercloud.io/content?token=API_TOKEN&url=https://site.com&device=mobile'
Parameter | Available values | Description |
---|---|---|
device | mobile / desktop (enabled by default, you can omit this param) | Choose the device type for UserAgent rotation in the request |
Proxies & GEO targeting
Our standard proxy pools include millions of proxies from over a dozen ISPs and should be sufficient for the vast majority of scraping jobs.
You can also use geo-targeting by setting the country
parameter, or omit the parameter for global proxy rotation.
curl 'https://chrome.browsercloud.io/content?token=API_TOKEN&url=https://site.com&country=US'
Parameter | Available values | Description |
---|---|---|
proxy | 1) omit parameter to use the common pool with 1.5+ million proxies 2) false - disable proxies (for example, when you need just JS rendering) 3) premium - premium proxy pool for a few particularly difficult to scrape sites | Proxy pool selection |
country | parameter works with the 'common proxy pool' 1) omit parameter or set ALL for global rotation 2) two-letter ISO country code. Example: US , CA , GB , DE and more 3) EU - proxy rotation over EU countries | Proxy geo targeting |
Residential & Mobile Proxies
Our standard proxy pools include millions of proxies from over a dozen ISPs and should be sufficient for the vast majority of scraping jobs. However, for a few particularly difficult to scrape sites, we also maintain a private internal pool of residential and mobile IPs. This pool is only available by request.
Wait for Element when rendering
If a rendered request is a bit slow and the page stabilizes before the request is satisfied, it can fool the API into thinking the page has finished rendering.
To cope with this, you can tell the API to wait for a dom element (selector) to appear on the page when rendering. You just need to send the wait-for
parameter, passing a URL-encoded jQuery selector.
The API will then wait for this to appear on the page before returning results.
curl 'https://chrome.browsercloud.io/content?token=API_TOKEN&url=https://site.com&wait-for=%23ajax-content'
Parameter | Available values | Description |
---|---|---|
wait-for | string, Example: %23ajax-content (%23 is # symbol) | URL-encoded selector. Requires JS rendering |
Javascript execution
You can pass your custom Javascript code to run in the browser context using the js_snippet param, and it will be executed after the page has finished loading. The example below injects the code through the addScriptTag option of a JSON POST body.
Custom javascript can be used for interaction with a page, like scrolling, pressing a button, etc.
# The URL is quoted so the shell does not treat "?" as a glob character.
curl -X POST \
  'https://chrome.browsercloud.io/content?token=API_TOKEN' \
  -H 'Cache-Control: no-cache' \
  -H 'Content-Type: application/json' \
  -d '
{
  "url": "https://browsercloud.io/doc-examples/content.html",
  "addScriptTag" : [
    {
      "content" : "let node = document.querySelector(\"#header-2\"); node.textContent = \"My custom JS did it!\""
    }
  ]
}'
Additional parameters
Example with all available JSON options
# The URL is quoted so the shell does not treat "?" as a glob character.
curl -X POST \
  'https://chrome.browsercloud.io/content?token=API_TOKEN' \
  -H 'Cache-Control: no-cache' \
  -H 'Content-Type: application/json' \
  -d '
{
  "url": "https://browsercloud.io/doc-examples/content.html",
  "addScriptTag" : [
    {
      "content" : "let node = document.querySelector(\"#header-2\"); node.textContent = \"My custom JS did it!\""
    }
  ],
  "setJavaScriptEnabled" : true,
  "waitFor" : "#delayed",
  "userAgent" : "Mozilla/5.0 (iPhone; CPU iPhone OS 13_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.2 Mobile/15E148 Safari/604.1",
  "rejectResourceTypes" : ["image"],
  "authenticate" : {
    "username" : "test",
    "password" : "test"
  },
  "cookies" : [
    {
      "name" : "session",
      "value" : "session-value",
      "domain" : "browsercloud.io"
    }
  ]
}'
Parameters | Available values | Description |
---|---|---|
url | string | URL for web scraping |
setJavaScriptEnabled | true(default), false - javascript rendering | Javascript rendering |
waitFor | string | Waits for a certain DOM element to be rendered |
addScriptTag.content | string - js code | Adds custom <script> tag to the page |
userAgent | string | sets custom UserAgent for a web scraper |
rejectResourceTypes | array of strings: 'document','stylesheet','image','media', 'font','script','texttrack','xhr', 'fetch','eventsource','websocket','manifest','other' | Blocks unnecessary resource types to speed up page load |
authenticate | username, password: string | Basic auth |
cookies | array of objects with name, value, domain: string | Custom cookies (for example: auth session) |