CoreClaw
Store
Pricing
Start Free Trial
Kael Odin

Dataset Deduplication & Merge Tool

Pricing
Try for free
Kael Odin

Dataset Deduplication & Merge Tool

odin-kael/dataset-deduplication-and-merge-tool

Dedup Datasets Worker is a powerful tool for merging and deduplicating datasets from multiple JSON/JSONL files. Fully optimized for the CafeScraper platform with enhanced features and robust error handling.

Try for Free
2,000 Free Results

You can access the Worker programmatically from your own applications using the CoreClaw API. Choose your preferred language from the options below. To get started with the CoreClaw API, you'll need a CoreClaw account and your API token — find it in the overview in your Console.

package main

import (
    "bytes"
    "encoding/json"
    "fmt"
    "io"
    "net/http"
    "time"
)

// Connection settings used by this sample when calling the CoreClaw API.
const (
    // API_URL is the API URL that run requests are sent to.
    API_URL = "https://openapi.coreclaw.com/api/v1/scraper/run"

    // API_KEY is your API key; replace the placeholder before running.
    API_KEY = "<YOUR_API_KEY>"

    // CALLBACK_URL is the endpoint that will receive the scraping results.
    CALLBACK_URL = "https://your-domain.com/callback"
)

// ScraperRunRequest is the JSON payload describing a scraper run to start.
type ScraperRunRequest struct {
    // ScraperSlug is the unique identifier for the scraper.
    ScraperSlug string `json:"scraper_slug"`

    // Version is the Worker version number.
    Version string `json:"version"`

    // Input carries the input parameters as already-encoded JSON.
    Input json.RawMessage `json:"input"`

    // CallbackURL is the callback URL.
    CallbackURL string `json:"callback_url"`
}

// ScraperRunResponse is the JSON envelope decoded from the scraper run
// response.
type ScraperRunResponse struct {
    // Code is the error code.
    Code int `json:"code"`

    // Message is the error message.
    Message string `json:"message"`

    // Data is the response data.
    Data Data `json:"data"`
}

// Data is the payload carried in the "data" field of a run response.
type Data struct {
    // RunSlug is the unique identifier for the run record.
    RunSlug string `json:"run_slug"`
}

func main() {
    // Build request parameters
    req := ScraperRunRequest{
        ScraperSlug: "01KG2DV66JTCN65ZBTRX3M456E",
        Version: "v1.0.8",
        Input: json.RawMessage(`{
          "system": {
              "proxy_region": "",
              "cpus": 0.125,
              "memory": 512,
              "execute_limit_time_seconds": 1800,
              "max_total_charge": 0,
              "max_total_traffic": 0
          },
          "custom": {
          "runUnits": [
                    {
                              "url": "https://coreclaw.local/__single_run__"
                    }
          ],
          "scenario": "ecommerce-products",
          "fields": [
                    {
                              "string": "productId"
                    },
                    {
                              "string": "sku"
                    }
          ],
          "mergeStrategy": "keep-newest",
          "timestampField": "updatedAt",
          "dataSourceType": "direct-input",
          "inputData": "[{\"productId\": \"P001\", \"sku\": \"SKU-A-BLACK\", \"name\": \"无线蓝牙耳机 Pro\", \"price\": 299.00, \"stock\": 156, \"source\": \"京东旗舰店\", \"updatedAt\": \"2024-01-20T10:30:00\"}, {\"productId\": \"P001\", \"sku\": \"SKU-A-BLACK\", \"name\": \"无线蓝牙耳机 Pro (黑)\", \"price\": 279.00, \"stock\": 200, \"source\": \"天猫旗舰店\", \"updatedAt\": \"2024-01-22T14:20:00\"}, {\"productId\": \"P001\", \"sku\": \"SKU-A-WHITE\", \"name\": \"无线蓝牙耳机 Pro\", \"price\": 299.00, \"stock\": 88, \"source\": \"京东旗舰店\", \"updatedAt\": \"2024-01-20T10:30:00\"}, {\"productId\": \"P002\", \"sku\": \"SKU-B\", \"name\": \"智能手表 Ultra\", \"price\": 1299.00, \"stock\": 45, \"source\": \"官网\", \"updatedAt\": \"2024-01-18T09:00:00\"}]",
          "inputUrls": [
                    {
                              "url": "https://raw.githubusercontent.com/kael-odin/worker-dedup-datasets/main/test/data1.json"
                    }
          ],
          "datasetIds": [],
          "inputFormat": "json",
          "output": "unique-items",
          "generateReport": true,
          "mode": "dedup-after-load",
          "fieldsToLoad": [],
          "nullAsUnique": false,
          "parallelLoads": 10,
          "parallelPushes": 5,
          "batchSize": 5000,
          "appendFileSource": false,
          "verboseLog": false
}
        }`),
        CallbackURL: CALLBACK_URL,
    }

    // Send request
    runSlug, err := runScraper(req, API_KEY)
    if err != nil {
        fmt.Printf("Request failed: %v
", err)
        return
    }

    fmt.Printf("Worker run successful!")
    fmt.Printf("Run record ID: %s
", runSlug)
    fmt.Printf("You can use this ID to query run status and results
")
}

// runScraper submits req to API_URL as a JSON POST authenticated with apiKey
// and returns the run slug found in the response.
//
// An error is returned when the request cannot be encoded, built or sent,
// when the response body cannot be read, when the HTTP status is not 200 OK,
// when the body is not valid JSON for ScraperRunResponse, or when the
// response carries a non-zero code.
func runScraper(req ScraperRunRequest, apiKey string) (string, error) {
    // Encode the request payload.
    payload, err := json.Marshal(req)
    if err != nil {
        return "", fmt.Errorf("failed to serialize request data: %w", err)
    }

    // Build the HTTP request and attach its headers.
    httpReq, err := http.NewRequest(http.MethodPost, API_URL, bytes.NewReader(payload))
    if err != nil {
        return "", fmt.Errorf("failed to create request: %w", err)
    }
    httpReq.Header.Set("Content-Type", "application/json")
    httpReq.Header.Set("api-key", apiKey)

    // Perform the call with a 30-second overall timeout.
    client := http.Client{Timeout: 30 * time.Second}
    resp, err := client.Do(httpReq)
    if err != nil {
        return "", fmt.Errorf("failed to send request: %w", err)
    }
    defer resp.Body.Close()

    // The body is read before the status check so that it can be included
    // in the error message for a non-200 response.
    raw, err := io.ReadAll(resp.Body)
    if err != nil {
        return "", fmt.Errorf("failed to read response: %w", err)
    }
    if status := resp.StatusCode; status != http.StatusOK {
        return "", fmt.Errorf("request failed, status code: %d, response: %s", status, raw)
    }

    // Decode the envelope and check the code it carries.
    var parsed ScraperRunResponse
    err = json.Unmarshal(raw, &parsed)
    if err != nil {
        return "", fmt.Errorf("failed to parse response: %w", err)
    }
    if parsed.Code != 0 {
        return "", fmt.Errorf("business error: %s (error code: %d)", parsed.Message, parsed.Code)
    }

    return parsed.Data.RunSlug, nil
}

Additional Resources

API Reference Documentation
Complete API documentation with all endpoints and parameters

Pricing

Failed results don't count

Rating

5.0

Developer

Kael Odin

Worker Stats

15 Total runs
Success rate: 86.67%
Last updated: Apr 20, 2026

Categories

Google

Share

You might also like

Explore more popular scrapers from our marketplace

View All Scrapers
Google Search Results (SERP) Scraper API

Google Search Results (SERP) Scraper API

by CoreClaw

It queries the Google search engine by keyword and returns a structured SERP summary, including the final search parameters, organic results, related queries, and people-also-ask data.

4.8
590 runs
From $1.2/1,000 results
Google Sheets Import Export Tool

Google Sheets Import Export Tool

by Kael Odin

A powerful Google Sheets data import export tool designed for data synchronization, backup, and integration between Google Sheets and external systems. Supports three operation modes, two authentication methods, batch processing, data deduplication, and automatic backup.

5.0
2 runs
From $1.2/1,000 results
Cheerio Web Scraping

Cheerio Web Scraping

by Kael Odin

A high-speed static page scraper based on Cheerio, designed specifically for static HTML pages. Uses Cheerio for HTML parsing, delivering speeds 10-50 times faster than full browser rendering.

5.0
3 runs
From $1.2/1,000 results
Playwright Web Scraping

Playwright Web Scraping

by Kael Odin

A powerful cross-browser web scraping tool using Playwright for complete browser rendering. Supports Chromium, Firefox, and WebKit browser engines. Perfect for dynamic pages, single-page applications (SPAs), infinite scroll pages, and cross-browser testing scenarios.

5.0
4 runs
From $1.2/1,000 results
View All Scrapers
CoreClaw

Deploy ready-to-use Workers to accelerate your data collection workflows.

Email: support@coreclaw.com

Resources

  • Quick Start
  • API Reference
  • Leads
  • Affiliate Program

Recommend

  • Store
  • Pricing

Address

Apex DataWorks Limited

UNIT 9, 1/F, THE CLOUD, 111 TUNG CHAU STREET, TAI KOK TSUI, KOWLOON, HONG KONG