Skip to content

Web/Data Scraping & Automation

Best Practices:

- Use `$net().get(url)` for HTTP requests; always check `.get("status")` for the HTTP status.
- Parse HTML/XML with `$XML().parse(html)` or `.html()`.
- Use `.select()` or `.findall()` to extract elements or attributes.
- Rate limit requests with `$sys().sleep(ms)` to avoid overloading servers.
- Use public, stable endpoints (e.g., https://httpbin.org/html, https://example.com) in examples.
- Always check for `$ERR` when accessing attributes or elements that may not exist.
- See Advanced Extraction Patterns for more on `.findall()` and complex queries.

Example: Web Scraper with Rate Limiting

/*
 * Scrape a list of pages one at a time, pausing between requests.
 *
 * urls     - array of URL strings, each fetched with $net().get()
 * delay_ms - pause in milliseconds taken before every request except the first
 *
 * Returns an array with one entry per URL, in input order:
 *   {"url", "success": true,  "content", "size"}  when the HTTP status is 200
 *   {"url", "success": false, "error"}            for any other status or a caught error
 *
 * The URLs are walked with .reduce() instead of .map(). Grapa documents .map()
 * as a parallel operation, and the second argument of a .map() callback is the
 * optional extra value handed to .map(), not the element position, so a sleep
 * inside a .map() callback cannot be relied on to space the requests out.
 */
scrape_pages = op(urls, delay_ms) {
    /* Fetch one URL and describe the outcome as a result list */
    fetch_page = op(url) {
        try {
            /* Perform HTTP GET request */
            response = $net().get(url);
            /* Only a 200 status counts as success */
            if (response.get("status") == 200) {
                /* Read the body once and reuse it for both fields */
                body = response.get("body");
                {
                    "url": url,
                    "success": true,
                    "content": body,
                    "size": body.len()
                };
            } else {
                {"url": url, "success": false, "error": "HTTP " + response.get("status").str()};
            };
        } catch (error) {
            /* Handle network or HTTP errors */
            {"url": url, "success": false, "error": error.get("message")};
        };
    };

    /* Sequential walk: results collects one entry per URL already fetched */
    urls.reduce(op(results, url) {
        /* A non-empty accumulator means this is not the first request */
        if (results.len() > 0) {
            $sys().sleep(delay_ms);
        };
        results += fetch_page(url);
    }, []);
};

/*
 * Pull heading text and link targets out of an HTML string.
 *
 * html_content - HTML source handed to $XML().parse()
 *
 * Returns {"titles": [...], "links": [...]}.
 */
extract_data = op(html_content) {
    parsed = $XML().parse(html_content);

    /* "text" of each element matched by the h1/h2/h3 selector */
    heading_texts = parsed.select("h1, h2, h3").map(op(heading) { heading.get("text"); });

    /* "href" of each anchor matched by the a[href] selector */
    link_targets = parsed.select("a[href]").map(op(anchor) { anchor.get("href"); });

    {"titles": heading_texts, "links": link_targets};
};

/* Pages fetched by this example */
target_urls = ["https://example.com/page1", "https://example.com/page2", "https://example.com/page3"];

/* Fetch them, passing 1000 ms as the delay between requests */
scraped_data = scrape_pages(target_urls, 1000);

/* Turn each scrape result into extracted data, or carry its error forward */
extracted_data = scraped_data.map(op(page) {
    if (!page.get("success")) {
        {"error": page.get("error")};
    } else {
        extract_data(page.get("content"));
    };
});

Example: API Data Collection

/*
 * Request every endpoint under base_url and parse each 200 response as JSON.
 *
 * base_url  - string prefixed to each endpoint
 * endpoints - array of endpoint path strings
 *
 * Returns one result list per endpoint:
 *   {"endpoint", "success": true,  "data"}   when the HTTP status is 200
 *   {"endpoint", "success": false, "error"}  for any other status or a caught error
 */
collect_api_data = op(base_url, endpoints) {
    collected = endpoints.map(op(endpoint) {
        try {
            reply = $net().get(base_url + endpoint);
            status = reply.get("status");

            if (status != 200) {
                {"endpoint": endpoint, "success": false, "error": "HTTP " + status.str()};
            } else {
                /* Body is parsed as JSON only for a 200 response */
                {"endpoint": endpoint, "success": true, "data": $JSON().parse(reply.get("body"))};
            };
        } catch (error) {
            {"endpoint": endpoint, "success": false, "error": error.get("message")};
        };
    });
    collected;
};

/* Endpoints requested by this example */
api_endpoints = ["/users", "/products", "/orders"];
api_data = collect_api_data("https://api.example.com", api_endpoints);

/* Split the results by outcome */
failed_requests = api_data.filter(op(entry) { !entry.get("success"); });
successful_data = api_data.filter(op(entry) { entry.get("success"); });

/* Report how many endpoints fell into each group */
("Successfully collected data from " + successful_data.len().str() + " endpoints").echo();
("Failed requests: " + failed_requests.len().str()).echo();

Example: Automated Form Submission

/*
 * POST form_data as JSON to https://example.com/submit.
 *
 * form_data - list of field names and values to submit
 *
 * Returns {"success": true, "result"} with the parsed JSON body when the HTTP
 * status is 200, otherwise {"success": false, "error"}.
 */
submit_form = op(form_data) {
    try {
        /* Serialize the fields and declare the payload type */
        payload = $JSON().stringify(form_data);
        headers = {"Content-Type": "application/json"};

        reply = $net().post("https://example.com/submit", payload, headers);
        status = reply.get("status");

        if (status != 200) {
            {"success": false, "error": "HTTP " + status.str()};
        } else {
            parsed = $JSON().parse(reply.get("body"));
            {"success": true, "result": parsed};
        };
    } catch (error) {
        {"success": false, "error": error.get("message")};
    };
};

/* Example form data */
form_data = {
    "name": "John Doe",
    "email": "john@example.com",
    "message": "Hello from Grapa!"
};

/* Submit the form and report the outcome */
result = submit_form(form_data);
if (result.get("success")) {
    /* The response JSON may not carry a "message" field; check for $ERR before using it */
    server_message = result.get("result").get("message");
    if (server_message.type() == $ERR) {
        "Form submitted successfully".echo();
    } else {
        ("Form submitted successfully: " + server_message).echo();
    };
} else {
    ("Form submission failed: " + result.get("error")).echo();
};