Pricing
from $4.99 / 1,000 results
🧠 Smart Article Extractor
Pricing
from $4.99 / 1,000 results
Rating
0.0
(0)
Developer
Actor stats
0
Bookmarked
2
Total users
1
Monthly active users
24 days ago
Last modified
Categories
Share
![]() |
VOOZH | about |
Pricing
from $4.99 / 1,000 results
Pricing
from $4.99 / 1,000 results
Rating
0.0
(0)
Developer
Actor stats
0
Bookmarked
2
Total users
1
Monthly active users
24 days ago
Last modified
Categories
Share
You can access the 🧠 Smart Article Extractor programmatically from your own applications by using the Apify API. You can also choose your preferred language below. To use the Apify API, you’ll need an Apify account and your API token, found in Integrations settings in Apify Console.
{"openapi":"3.0.1","info":{"version":"0.3","x-build-id":"TvaCmQgjsQalPmyJ2"},"servers":[{"url":"https://api.apify.com/v2"}],"paths":{"/acts/scrapier~smart-article-extractor/run-sync-get-dataset-items":{"post":{"operationId":"run-sync-get-dataset-items-scrapier-smart-article-extractor","x-openai-isConsequential":false,"summary":"Executes an Actor, waits for its completion, and returns Actor's dataset items in response.","tags":["Run Actor"],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/inputSchema"}}}},"parameters":[{"name":"token","in":"query","required":true,"schema":{"type":"string"},"description":"Enter your Apify token here"}],"responses":{"200":{"description":"OK"}}}},"/acts/scrapier~smart-article-extractor/runs":{"post":{"operationId":"runs-sync-scrapier-smart-article-extractor","x-openai-isConsequential":false,"summary":"Executes an Actor and returns information about the initiated run in response.","tags":["Run Actor"],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/inputSchema"}}}},"parameters":[{"name":"token","in":"query","required":true,"schema":{"type":"string"},"description":"Enter your Apify token here"}],"responses":{"200":{"description":"OK","content":{"application/json":{"schema":{"$ref":"#/components/schemas/runsResponseSchema"}}}}}}},"/acts/scrapier~smart-article-extractor/run-sync":{"post":{"operationId":"run-sync-scrapier-smart-article-extractor","x-openai-isConsequential":false,"summary":"Executes an Actor, waits for completion, and returns the OUTPUT from Key-value store in response.","tags":["Run Actor"],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/inputSchema"}}}},"parameters":[{"name":"token","in":"query","required":true,"schema":{"type":"string"},"description":"Enter your Apify token 
here"}],"responses":{"200":{"description":"OK"}}}}},"components":{"schemas":{"inputSchema":{"type":"object","required":["startUrls"],"properties":{"startUrls":{"title":"๐ Website / Category URLs","type":"array","description":"Top-level pages the crawler should start from โ homepages, sections, topic pages. Each one is treated as a category page and articles are discovered from it.","items":{"type":"string"}},"articleUrls":{"title":"๐ฐ Article URLs","type":"array","description":"Already-known article URLs to extract directly (no discovery needed). Mix with Website URLs for hybrid runs.","items":{"type":"string"}},"onlyNewArticles":{"title":"๐ Only new articles (only for small runs)","type":"boolean","description":"Skip articles that were extracted in any previous run (deduplicated globally via the key-value store). Best for low-volume runs.","default":false},"onlyNewArticlesPerDomain":{"title":"๐ Only new articles (saved per domain, preferable)","type":"boolean","description":"Same as above, but the deduplication memory is kept separately per domain โ preferable for multi-domain runs.","default":false},"onlyInsideArticles":{"title":"๐ Only inside domain articles","type":"boolean","description":"When enqueueing from an article, accept only links that point back to the same registrable domain.","default":true},"enqueueFromArticles":{"title":"๐งญ Enqueue articles from articles","type":"boolean","description":"Discover further article links inside extracted articles and add them to the crawl queue.","default":false},"crawlWholeSubdomain":{"title":"๐ธ๏ธ Crawl whole subdomain (same base as Start URL)","type":"boolean","description":"Treat every same-subdomain link as a potential category page (depth-limited).","default":false},"onlySubdomainArticles":{"title":"๐ท๏ธ Limit articles to only from subdomain","type":"boolean","description":"Restrict articles to URLs starting with the same path prefix as the Start URL (e.g. 
example.com/news/*).","default":false},"scanSitemaps":{"title":"๐บ๏ธ Find articles in sitemaps (caution)","type":"boolean","description":"Discover article URLs from robots.txt โ Sitemap entries and the usual /sitemap.xml candidates. Disable if it produces too many noisy candidates.","default":false},"sitemapUrls":{"title":"๐บ๏ธ Sitemap URLs (safer)","type":"array","description":"Explicit sitemap URLs โ skips auto-discovery and only uses these. Safer than full robots.txt scanning.","default":[],"items":{"type":"string"}},"saveHtml":{"title":"๐พ Save full HTML","type":"boolean","description":"Include the full page HTML in the dataset record (produces large records).","default":false},"saveHtmlAsLink":{"title":"๐ Save full HTML (only as link to it)","type":"boolean","description":"Save HTML to the run's key-value store and put the link in the record (smaller dataset).","default":false},"saveSnapshots":{"title":"๐ธ Save screenshots of article pages (browser only)","type":"boolean","description":"Take a PNG screenshot of every article. Only effective when the headless browser is enabled.","default":false},"useGoogleBotHeaders":{"title":"๐ค Use Googlebot headers","type":"boolean","description":"Send the Googlebot User-Agent + headers. Many publishers allow Googlebot through paywalls / soft-blocks.","default":false},"minWords":{"title":"๐ Minimum words","minimum":0,"maximum":100000,"type":"integer","description":"Reject articles whose extracted text has fewer than this many words.","default":150},"dateFrom":{"title":"๐ Extract articles from [date]","type":"string","description":"ISO date (YYYY-MM-DD). Only keep articles published on or after this date."},"onlyArticlesForLastDays":{"title":"๐ Only articles for last X days","minimum":0,"maximum":3650,"type":"integer","description":"Drop anything older than X days. Combined with dateFrom, the stricter of the two wins."},"mustHaveDate":{"title":"๐
Must have date","type":"boolean","description":"Drop articles where no publication-date metadata can be detected.","default":true},"isUrlArticleDefinition":{"title":"๐งช Is the URL an article?","type":"object","description":"Heuristics for classifying a URL as an article. minDashes = minimum dashes in the path, hasDate = path contains a /YYYY/MM/DD/ pattern, linkIncludes = substrings that mark a URL as an article.","default":{"minDashes":4,"hasDate":true,"linkIncludes":["article","storyid","?p=","id=","/fpss/track",".html","/content/"]}},"pseudoUrls":{"title":"๐งฉ Pseudo URLs","type":"array","description":"Additional URL patterns ([.*], [\\d+]) that mark a page as a crawlable category. If you want to enqueue direct article URLs this way, you have to add { \"label\": \"article\" } to the userData.","default":[],"items":{"type":"object","required":["purl"],"properties":{"purl":{"type":"string","title":"Pseudo-URL of a web page"}}}},"linkSelector":{"title":"๐ฏ Link selector","type":"string","description":"Optional CSS selector restricting which parts of a category page contribute links (e.g. main a, .article-list a)."},"maxDepth":{"title":"๐ช Max depth","minimum":0,"maximum":20,"type":"integer","description":"Maximum BFS depth from the Start URL (Start URL = 0). 
Empty = no extra cap.","default":2},"maxPagesPerCrawl":{"title":"๐ Max pages per crawl","minimum":1,"maximum":100000,"type":"integer","description":"Hard cap on pages fetched in one run (articles + category pages combined).","default":50},"maxArticlesPerCrawl":{"title":"โจ Max articles per crawl","minimum":1,"maximum":100000,"type":"integer","description":"Hard cap on extracted articles per run.","default":25},"maxArticlesPerStartUrl":{"title":"๐ฏ Max articles per start URL","minimum":1,"maximum":100000,"type":"integer","description":"Cap how many articles can be attributed to a single Start URL.","default":25},"maxConcurrency":{"title":"โก Max concurrency","minimum":1,"maximum":100,"type":"integer","description":"How many fetches the crawler may run in parallel. Higher = faster, but more pressure on the target site and proxy quota. Leave empty for safe sequential mode.","default":1},"proxyConfiguration":{"title":"๐ก๏ธ Proxy configuration","type":"object","description":"Proxy settings. Default = NO PROXY (direct). If the target blocks the request, the actor automatically falls back to DATACENTER, then RESIDENTIAL (with up to 3 retries on residential). Once a fallback occurs, it sticks."},"useBrowser":{"title":"๐ญ Use browser (Playwright)","type":"boolean","description":"Render with Chromium when raw HTTP fails or the page is JS-heavy. Slower but bypasses many anti-bot walls.","default":false},"pageWaitMs":{"title":"โฑ๏ธ Wait on each page (ms)","minimum":0,"maximum":60000,"type":"integer","description":"Extra time to wait after navigation finishes (milliseconds). 
Useful for lazily-loaded scripts.","default":0},"waitUntil":{"title":"๐ฆ Wait until navigation event is finished","enum":["load","domcontentloaded","networkidle","commit"],"type":"string","description":"Which navigation event Playwright waits for before considering the page ready.","default":"load"},"categoryWaitForSelector":{"title":"๐๏ธ Wait for selector on each category page","type":"string","description":"Optional CSS selector. The browser will wait for this element to appear before extracting links from category pages."},"articleWaitForSelector":{"title":"๐ฐ Wait for selector on each article page","type":"string","description":"Optional CSS selector. The browser will wait for this element to appear before extracting article content."},"scrollToBottom":{"title":"๐ฑ๏ธ Scroll to bottom of the page (infinite scroll)","type":"boolean","description":"Auto-scroll to the bottom of category/article pages so lazy-loaded content is rendered.","default":false},"scrollToBottomButtonSelector":{"title":"๐ Scroll to bottom button selector","type":"string","description":"Optional CSS selector for a 'Load more' button. The crawler will click it repeatedly while scrolling."},"scrollToBottomMaxSeconds":{"title":"โฒ๏ธ Scroll to bottom max seconds","minimum":1,"maximum":600,"type":"integer","description":"Maximum time spent scrolling per page (safety cap).","default":60},"extendOutputFunction":{"title":"๐ ๏ธ Extend output function","type":"string","description":"Only needed if you want more data than is included in the default output. Keep in mind that you should provide a valid Python function: def extend(soup, article, html): return {...}. The returned dict is merged into each article record."},"maxCUs":{"title":"๐งฎ Limit CU consumption","minimum":0,"maximum":100000,"type":"integer","description":"Soft cap on Apify Compute Units this run may consume. The actor checks usage between requests and exits gracefully when the cap is hit. 
Leave empty for no cap."},"notificationEmails":{"title":"๐ง Emails address for notifications","type":"array","description":"Email addresses to notify when the CU thresholds below are crossed.","default":[],"items":{"type":"string"}},"notifyAfterCUs":{"title":"๐ Notify after [number] CUs","minimum":0,"maximum":100000,"type":"integer","description":"Send a one-time notification once this many CUs have been consumed."},"notifyAfterCUsEvery":{"title":"๐ Notify every [number] CUs","minimum":0,"maximum":100000,"type":"integer","description":"Send a notification every N CUs after the initial notification threshold."}}},"runsResponseSchema":{"type":"object","properties":{"data":{"type":"object","properties":{"id":{"type":"string"},"actId":{"type":"string"},"userId":{"type":"string"},"startedAt":{"type":"string","format":"date-time","example":"2025-01-08T00:00:00.000Z"},"finishedAt":{"type":"string","format":"date-time","example":"2025-01-08T00:00:00.000Z"},"status":{"type":"string","example":"READY"},"meta":{"type":"object","properties":{"origin":{"type":"string","example":"API"},"userAgent":{"type":"string"}}},"stats":{"type":"object","properties":{"inputBodyLen":{"type":"integer","example":2000},"rebootCount":{"type":"integer","example":0},"restartCount":{"type":"integer","example":0},"resurrectCount":{"type":"integer","example":0},"computeUnits":{"type":"integer","example":0}}},"options":{"type":"object","properties":{"build":{"type":"string","example":"latest"},"timeoutSecs":{"type":"integer","example":300},"memoryMbytes":{"type":"integer","example":1024},"diskMbytes":{"type":"integer","example":2048}}},"buildId":{"type":"string"},"defaultKeyValueStoreId":{"type":"string"},"defaultDatasetId":{"type":"string"},"defaultRequestQueueId":{"type":"string"},"buildNumber":{"type":"string","example":"1.0.0"},"containerUrl":{"type":"string"},"usage":{"type":"object","properties":{"ACTOR_COMPUTE_UNITS":{"type":"integer","example":0},"DATASET_READS":{"type":"integer","example":0},"
DATASET_WRITES":{"type":"integer","example":0},"KEY_VALUE_STORE_READS":{"type":"integer","example":0},"KEY_VALUE_STORE_WRITES":{"type":"integer","example":1},"KEY_VALUE_STORE_LISTS":{"type":"integer","example":0},"REQUEST_QUEUE_READS":{"type":"integer","example":0},"REQUEST_QUEUE_WRITES":{"type":"integer","example":0},"DATA_TRANSFER_INTERNAL_GBYTES":{"type":"integer","example":0},"DATA_TRANSFER_EXTERNAL_GBYTES":{"type":"integer","example":0},"PROXY_RESIDENTIAL_TRANSFER_GBYTES":{"type":"integer","example":0},"PROXY_SERPS":{"type":"integer","example":0}}},"usageTotalUsd":{"type":"number","example":0.00005},"usageUsd":{"type":"object","properties":{"ACTOR_COMPUTE_UNITS":{"type":"integer","example":0},"DATASET_READS":{"type":"integer","example":0},"DATASET_WRITES":{"type":"integer","example":0},"KEY_VALUE_STORE_READS":{"type":"integer","example":0},"KEY_VALUE_STORE_WRITES":{"type":"number","example":0.00005},"KEY_VALUE_STORE_LISTS":{"type":"integer","example":0},"REQUEST_QUEUE_READS":{"type":"integer","example":0},"REQUEST_QUEUE_WRITES":{"type":"integer","example":0},"DATA_TRANSFER_INTERNAL_GBYTES":{"type":"integer","example":0},"DATA_TRANSFER_EXTERNAL_GBYTES":{"type":"integer","example":0},"PROXY_RESIDENTIAL_TRANSFER_GBYTES":{"type":"integer","example":0},"PROXY_SERPS":{"type":"integer","example":0}}}}}}}}}}OpenAPI is a standard for designing and describing RESTful APIs, allowing developers to define API structure, endpoints, and data formats in a machine-readable way. It simplifies API development, integration, and documentation.
OpenAPI is effective when used with AI agents and GPTs by standardizing how these systems interact with various APIs, for reliable integrations and efficient communication.
By defining machine-readable API specifications, OpenAPI allows AI models like GPTs to understand and use varied data sources, improving accuracy. This accelerates development, reduces errors, and provides context-aware responses, making OpenAPI a core component for AI applications.
You can download the OpenAPI definitions for 🧠 Smart Article Extractor from the options below:
If you’d like to learn more about how OpenAPI powers GPTs, read our blog post.
You can also check out our other API clients: