tap
Back

toutiao/search

toutiaoRead-only

今日头条搜索

so.toutiao.com
Last 7 days
0
Last 30 days
0
All time
0
toutiao/search.js
/* @meta
{
  "name": "toutiao/search",
  "description": "今日头条搜索",
  "domain": "so.toutiao.com",
  "args": {
    "query": {"required": true, "description": "搜索关键词"},
    "count": {"required": false, "description": "返回结果数量 (默认 10, 最多 20)"}
  },
  "readOnly": true,
  "example": "tap site toutiao/search AI"
}
*/

async function(args) {
  if (!args.query) return {error: 'Missing argument: query', hint: 'Provide a search keyword'};
  const count = Math.min(parseInt(args.count) || 10, 20);

  const url = 'https://so.toutiao.com/search?keyword=' + encodeURIComponent(args.query) + '&pd=information&dvpf=pc';
  const resp = await fetch(url, {credentials: 'include'});
  if (!resp.ok) return {error: 'HTTP ' + resp.status, hint: 'Open so.toutiao.com in tap first'};

  const html = await resp.text();
  const parser = new DOMParser();
  const doc = parser.parseFromString(html, 'text/html');

  const results = [];

  // Helper: extract clean article URL from jump redirect chain
  function extractArticleUrl(href) {
    if (!href) return '';
    try {
      // Decode nested jump URLs to find the real toutiao article URL
      let decoded = href;
      for (let i = 0; i < 5; i++) {
        const match = decoded.match(/toutiao\.com(?:%2F|\/)+a?(\d{15,})/);
        if (match) return 'https://www.toutiao.com/article/' + match[1] + '/';
        const groupMatch = decoded.match(/group(?:%2F|\/)(\d{15,})/);
        if (groupMatch) return 'https://www.toutiao.com/article/' + groupMatch[1] + '/';
        decoded = decodeURIComponent(decoded);
      }
    } catch (e) {}
    return href;
  }

  // Strategy 1: SSR HTML uses cs-card containers
  const cards = doc.querySelectorAll('.cs-card');
  for (const card of cards) {
    const titleLink = card.querySelector('a[href*="search/jump"]');
    if (!titleLink) continue;

    const title = (titleLink.textContent || '').trim();
    if (!title || title.length < 2) continue;
    // Skip non-result links like "去西瓜搜" / "去抖音搜"
    if (title.includes('去西瓜搜') || title.includes('去抖音搜')) continue;

    const articleUrl = extractArticleUrl(titleLink.getAttribute('href') || '');

    // Extract snippet & source & time from card text
    const fullText = (card.textContent || '').trim();
    // Remove the title (may appear twice) to get the rest
    let rest = fullText;
    const titleIdx = rest.indexOf(title);
    if (titleIdx >= 0) rest = rest.substring(titleIdx + title.length);
    // Remove second occurrence of title if present
    const titleIdx2 = rest.indexOf(title);
    if (titleIdx2 >= 0) rest = rest.substring(titleIdx2 + title.length);
    rest = rest.trim();

    let snippet = '';
    let source = '';
    let time = '';

    // Remove trailing comment count like "1评论" or "23评论" first
    rest = rest.replace(/\d+评论/g, '').trim();

    // Extract time from the tail first
    // Time patterns: "3天前", "12小时前", "5分钟前", "前天17:23", "昨天08:00", "2024-01-01"
    // The number-based patterns (N天前 etc.) must NOT be preceded by a digit
    const timeMatch = rest.match(/((?<=[^\d])|^)(\d{1,2}(?:小时|分钟|天)前|前天[\d:]*|昨天[\d:]*|\d{4}[-/.]\d{2}[-/.]\d{2}.*)$/);
    if (timeMatch) {
      time = timeMatch[2] ? timeMatch[2].trim() : timeMatch[0].trim();
      rest = rest.substring(0, rest.length - timeMatch[0].length).trim();
    }

    // Source is the short text at the end (author/media name, typically 2-20 chars)
    // Pattern: "...snippet content...SourceName"
    const sourceMatch = rest.match(/^([\s\S]+?)([\u4e00-\u9fa5A-Za-z][\u4e00-\u9fa5A-Za-z0-9_\s]{1,19})$/);
    if (sourceMatch && sourceMatch[1].length > 10) {
      snippet = sourceMatch[1].trim().substring(0, 300);
      source = sourceMatch[2].trim();
    } else {
      snippet = rest.substring(0, 300);
    }

    results.push({title, snippet, source, time, url: articleUrl});
    if (results.length >= count) break;
  }

  // Strategy 2: Fallback to finding jump links with article IDs
  if (results.length === 0) {
    const links = doc.querySelectorAll('a[href*="search/jump"]');
    for (const link of links) {
      const text = (link.textContent || '').trim();
      if (!text || text.length < 4) continue;
      // Skip navigation/promo links
      if (text.includes('去西瓜搜') || text.includes('去抖音搜') || text.includes('APP')) continue;

      const href = link.getAttribute('href') || '';
      // Only include links that point to actual articles
      if (!href.match(/toutiao\.com|group|a\d{10,}/)) continue;

      const articleUrl = extractArticleUrl(href);
      if (results.some(r => r.title === text)) continue;

      // Try to get snippet from sibling/parent context
      let snippet = '';
      const container = link.closest('[class*="card"]') || link.parentElement?.parentElement;
      if (container) {
        const containerText = (container.textContent || '').trim();
        const afterTitle = containerText.indexOf(text);
        if (afterTitle >= 0) {
          const rest = containerText.substring(afterTitle + text.length).trim();
          if (rest.length > 10) snippet = rest.substring(0, 300);
        }
      }

      results.push({title: text, snippet, source: '', time: '', url: articleUrl});
      if (results.length >= count) break;
    }
  }

  if (results.length === 0) {
    return {
      error: 'No results found',
      hint: 'Toutiao may require login or has anti-scraping protection. Try: 1) Open so.toutiao.com in tap first, 2) Log in to toutiao, 3) Use toutiao/hot instead',
      query: args.query
    };
  }

  return {
    query: args.query,
    count: results.length,
    results
  };
}
Updated Mar 31, 2026Created Mar 31, 2026SHA-256: 494c10277b73