mirror of
https://github.com/RSS-Bridge/rss-bridge.git
synced 2025-02-10 06:12:25 +00:00
147 lines
4.8 KiB
PHP
147 lines
4.8 KiB
PHP
<?php
|
|
|
|
class AnthropicBridge extends BridgeAbstract
|
|
{
|
|
const MAINTAINER = 'sqrtminusone';
|
|
const NAME = 'Anthropic Research Bridge';
|
|
const URI = 'https://www.anthropic.com';
|
|
|
|
const CACHE_TIMEOUT = 3600; // 1 hour
|
|
const DESCRIPTION = 'Returns research publications from Anthropic';
|
|
|
|
const PARAMETERS = [
|
|
'' => [
|
|
'limit' => [
|
|
'name' => 'Limit',
|
|
'type' => 'number',
|
|
'required' => true,
|
|
'defaultValue' => 10
|
|
],
|
|
]
|
|
];
|
|
|
|
public function collectData()
|
|
{
|
|
// Anthropic sometimes returns 500 for no reason. The contents are still there.
|
|
$html = $this->getHTMLIgnoreError(self::URI . '/research');
|
|
$limit = $this->getInput('limit');
|
|
|
|
$page_data = $this->extractPageData($html);
|
|
$pages = $this->parsePageData($page_data);
|
|
for ($i = 0; $i < min(count($pages), $limit); $i++) {
|
|
$page = $pages[$i];
|
|
$page['content'] = $this->parsePage($page['uri']);
|
|
$this->items[] = $page;
|
|
}
|
|
}
|
|
|
|
private function getHTMLIgnoreError($url, $ttl = null)
|
|
{
|
|
if ($ttl != null) {
|
|
$cacheKey = 'pages_' . $url;
|
|
$content = $this->cache->get($cacheKey);
|
|
if ($content) {
|
|
return str_get_html($content);
|
|
}
|
|
}
|
|
|
|
try {
|
|
$content = getContents($url);
|
|
} catch (HttpException $e) {
|
|
$content = $e->response->getBody();
|
|
}
|
|
if ($ttl != null) {
|
|
$this->cache->set($cacheKey, $content, $ttl);
|
|
}
|
|
return str_get_html($content);
|
|
}
|
|
|
|
private function extractPageData($html)
|
|
{
|
|
foreach ($html->find('script') as $script) {
|
|
$js_code = $script->innertext;
|
|
if (!str_starts_with($js_code, 'self.__next_f.push(')) {
|
|
continue;
|
|
}
|
|
$push_data = (string)json_decode(mb_substr($js_code, 22, mb_strlen($js_code) - 2 - 22));
|
|
$square_bracket = mb_strpos($push_data, '[');
|
|
$push_array = json_decode(mb_substr($push_data, $square_bracket), true);
|
|
if ($push_array == null || count($push_array) < 4) {
|
|
continue;
|
|
}
|
|
$page_data = $push_array[3];
|
|
if ($page_data != null && array_key_exists('page', $page_data)) {
|
|
return $page_data;
|
|
}
|
|
}
|
|
}
|
|
|
|
private function parsePageData($page_data)
|
|
{
|
|
$result = [];
|
|
foreach ($page_data['page']['sections'] as $section) {
|
|
if (
|
|
!array_key_exists('internalName', $section) ||
|
|
$section['internalName'] != 'Research Teams'
|
|
) {
|
|
continue;
|
|
}
|
|
foreach ($section['tabPages'] as $tabPage) {
|
|
if ($tabPage['label'] != 'Overview') {
|
|
continue;
|
|
}
|
|
foreach ($tabPage['sections'] as $section1) {
|
|
if (
|
|
!array_key_exists('title', $section1)
|
|
|| $section1['title'] != 'Publications'
|
|
) {
|
|
continue;
|
|
}
|
|
foreach ($section1['posts'] as $post) {
|
|
$enc = [];
|
|
if ($post['cta'] != null && array_key_exists('url', $post['cta'])) {
|
|
$enc = [$post['cta']['url']];
|
|
}
|
|
$result[] = [
|
|
'title' => $post['title'],
|
|
'timestamp' => $post['publishedOn'],
|
|
'uri' => self::URI . '/research/' . $post['slug']['current'],
|
|
'categories' => array_map(
|
|
fn($s) => $s['label'],
|
|
$post['subjects'],
|
|
),
|
|
'enclosures' => $enc,
|
|
];
|
|
}
|
|
break;
|
|
}
|
|
break;
|
|
}
|
|
break;
|
|
}
|
|
return $result;
|
|
}
|
|
|
|
private function parsePage($url)
|
|
{
|
|
// Again, 500 for no reason.
|
|
$html = $this->getHTMLIgnoreError($url, 7 * 24 * 60 * 60);
|
|
|
|
$content = '';
|
|
|
|
// Main content
|
|
$main = $html->find('div[class*="PostDetail_post-detail"] > article', 0);
|
|
|
|
// Mostly YouTube videos
|
|
$iframes = $main->find('iframe');
|
|
foreach ($iframes as $iframe) {
|
|
$iframe->parent->removeAttribute('style');
|
|
$iframe->outertext = '<a href="' . $iframe->src . '">' . $iframe->src . '</a>';
|
|
}
|
|
|
|
$main = convertLazyLoading($main);
|
|
$main = defaultLinkTo($main, self::URI);
|
|
$content .= $main;
|
|
return $content;
|
|
}
|
|
}
|