weixin/article
weixin · Read-only · Fetch WeChat MP article by URL — title, author, content, images (HTTP-only, no browser)
mp.weixin.qq.com
Last 7 days
0
Last 30 days
0
All time
0
weixin/article.js
/* @meta
{
"name": "weixin/article",
"description": "Fetch WeChat MP article by URL — title, author, content, images (HTTP-only, no browser)",
"domain": "mp.weixin.qq.com",
"args": {
"url": {"required": true, "description": "WeChat article URL or short ID, e.g. https://mp.weixin.qq.com/s/xxxxx"}
},
"runtime": "http",
"readOnly": true,
"example": "tap site weixin/article url=https://mp.weixin.qq.com/s/DJtp4QUJtJHCnNU9yarsww"
}
*/
async function(args) {
if (!args.url) {
return {error: 'Missing argument: url', hint: 'Provide a WeChat article URL'};
}
let url = args.url.trim();
if (!url.includes('://') && !url.startsWith('//')) {
url = 'https://mp.weixin.qq.com/s/' + url;
} else if (url.startsWith('//')) {
url = 'https:' + url;
}
if (!url.includes('mp.weixin.qq.com')) {
return {error: 'Invalid URL', hint: 'URL must be from mp.weixin.qq.com'};
}
// WeChat Mobile UA bypasses access restrictions without needing a browser
const resp = await fetch(url, {
headers: {
'User-Agent': 'Mozilla/5.0 (Linux; U; Android 4.1.2; zh-cn; MI ONE Plus Build/JZO54K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30 MicroMessenger/5.0.1.352',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8'
}
});
if (!resp.ok) return {error: 'HTTP ' + resp.status};
const html = await resp.text();
if (html.includes('未知错误') && html.includes('暂无权限查看此页面内容')) {
return {error: 'Access denied', hint: 'WeChat blocked this request. Try a different URL.'};
}
// The page is ~3MB of JS bundles. Work only on small relevant slices to avoid
// running expensive operations on the full string.
// Metadata variables live in a small <script> block containing 'var msg_title'.
// Extract just that block (~20KB) for regex operations.
const metaAnchor = 'var msg_title';
const metaPos = html.indexOf(metaAnchor);
const metaStart = metaPos !== -1 ? html.lastIndexOf('<script', metaPos) : -1;
const metaEnd = metaPos !== -1 ? html.indexOf('</script>', metaPos) : -1;
const metaBlock = (metaStart !== -1 && metaEnd !== -1)
? html.slice(metaStart, metaEnd)
: (metaPos !== -1 ? html.slice(Math.max(0, metaPos - 500), metaPos + 10000) : '');
// Article body lives in the div with id="js_content". Slice 300KB around it.
const contentAnchor = 'id="js_content"';
const contentIdx = html.indexOf(contentAnchor);
const contentBlock = contentIdx !== -1
? html.slice(contentIdx, contentIdx + 300000)
: '';
/**
 * Decode the small set of HTML entities that appear in the page's metadata
 * strings: &quot; &lt; &gt; &nbsp; &#39; (also &#x27;) and &amp;.
 *
 * @param {string|null|undefined} str - Text that may contain entities.
 * @returns {string|null|undefined} The decoded text; falsy input is returned unchanged.
 */
function decodeEntities(str) {
  if (!str) return str;
  return str
    .replace(/&quot;/g, '"')
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&nbsp;/g, ' ')
    .replace(/&#(?:39|x27);/gi, "'")
    // &amp; must be decoded last: doing it earlier would turn "&amp;lt;"
    // into "&lt;" and then into "<", unescaping the text twice.
    .replace(/&amp;/g, '&');
}
/**
 * Read a string-valued `var <name> = ...;` assignment out of `metaBlock`.
 *
 * Three right-hand-side shapes are tried in order, and the first one that
 * matches wins: `htmlDecode("...")`, `"...".html(false)`, and a plain quoted
 * literal optionally followed by more expression text up to the `;`.
 *
 * @param {string} name - Variable name, interpolated into the pattern as-is.
 * @returns {string|null} The quoted value passed through `decodeEntities`,
 *   or null when no shape matches (an empty literal never matches).
 */
function extractStr(name) {
  const assignment = 'var\\s+' + name + '\\s*=\\s*';
  const valueShapes = [
    'htmlDecode\\(["\']([^"\']+)["\']\\)\\s*;',
    '["\']([^"\']+)["\']\\.html\\(false\\)\\s*;',
    '["\']([^"\']+)["\'][^;]*;',
  ];
  for (const shape of valueShapes) {
    const found = new RegExp(assignment + shape).exec(metaBlock);
    if (found) {
      return decodeEntities(found[1]);
    }
  }
  return null;
}
/**
 * Read an integer-valued `var <name> = ...;` assignment out of `metaBlock`.
 * The digits may be bare or wrapped in quotes; anything after them up to the
 * `;` is ignored.
 *
 * @param {string} name - Variable name, interpolated into the pattern as-is.
 * @returns {number|null} The leading run of digits parsed as base-10, or null
 *   when the assignment is absent or does not start with digits.
 */
function extractNum(name) {
  const pattern = new RegExp('var\\s+' + name + '\\s*=\\s*["\']?([0-9]+)["\']?[^;]*;');
  const found = pattern.exec(metaBlock);
  return found ? parseInt(found[1], 10) : null;
}
// Pull every metadata field out of the inline script block in one pass.
// Each name is the page's own variable name; a field that cannot be found is null.
const [
  title,
  author,
  description,
  coverImage,
  sourceUrl,
  canonicalUrl,
  mid,
  idx,
  sn,
  biz,
] = [
  'msg_title',
  'nickname',
  'msg_desc',
  'msg_cdn_url',
  'msg_source_url',
  'msg_link',
  'mid',
  'idx',
  'sn',
  'biz',
].map((varName) => extractStr(varName));
// `ct` is later multiplied by 1000 to build a Date.
const ct = extractNum('ct');
/**
 * Return the inner HTML of the element carrying id="js_content".
 *
 * Scans with indexOf rather than walking characters, counting nested
 * `<div` / `</div>` pairs until the one that closes the content element.
 *
 * @param {string} block - HTML slice expected to contain the content element.
 * @returns {string} The inner HTML, or '' when the marker, the end of its
 *   opening tag, or the matching `</div>` cannot be found.
 */
function extractContent(block) {
  const OPEN_TAG = '<div';
  const CLOSE_TAG = '</div>';

  const markerAt = block.indexOf('id="js_content"');
  if (markerAt < 0) return '';
  const openTagEnd = block.indexOf('>', markerAt);
  if (openTagEnd < 0) return '';

  const bodyStart = openTagEnd + 1;
  let cursor = bodyStart;
  let nesting = 1;
  while (nesting > 0) {
    const nextClose = block.indexOf(CLOSE_TAG, cursor);
    if (nextClose < 0) {
      // Unbalanced markup: give up.
      return '';
    }
    const nextOpen = block.indexOf(OPEN_TAG, cursor);
    const opensFirst = nextOpen >= 0 && nextOpen < nextClose;
    if (opensFirst) {
      nesting += 1;
      cursor = nextOpen + OPEN_TAG.length;
      continue;
    }
    nesting -= 1;
    if (nesting === 0) {
      return block.slice(bodyStart, nextClose);
    }
    cursor = nextClose + CLOSE_TAG.length;
  }
  return '';
}
const contentHtml = extractContent(contentBlock);
let content = '';
if (contentHtml) {
content = contentHtml
.replace(/<br\s*\/?>/gi, '\n')
.replace(/<\/p>/gi, '\n')
.replace(/<p[^>]*>/gi, '')
.replace(/<section[^>]*>/gi, '\n')
.replace(/<\/section>/gi, '')
.replace(/<span[^>]*>/gi, '')
.replace(/<\/span>/gi, '')
.replace(/<strong[^>]*>/gi, '**')
.replace(/<\/strong>/gi, '**')
.replace(/<b[^>]*>/gi, '**')
.replace(/<\/b>/gi, '**')
.replace(/<em[^>]*>/gi, '*')
.replace(/<\/em>/gi, '*')
.replace(/<i[^>]*>/gi, '*')
.replace(/<\/i>/gi, '*')
.replace(/<h[1-6][^>]*>/gi, '\n')
.replace(/<\/h[1-6]>/gi, '\n')
.replace(/<img[^>]*>/gi, '')
.replace(/<[^>]+>/g, '')
.replace(/ /g, ' ')
.replace(/</g, '<')
.replace(/>/g, '>')
.replace(/&/g, '&')
.replace(/"/g, '"')
.replace(/'/g, "'")
.replace(/\n{3,}/g, '\n\n')
.trim();
}
const images = [];
const imgRe = /<img[^>]+(?:data-src|src)=["']([^"']+)["']/g;
let imgMatch;
while ((imgMatch = imgRe.exec(contentHtml)) !== null) {
const imgUrl = imgMatch[1];
if (imgUrl && !imgUrl.startsWith('data:') && images.indexOf(imgUrl) === -1) {
images.push(imgUrl);
}
}
return {
title,
author,
description,
publish_time: ct ? new Date(ct * 1000).toISOString() : null,
publish_timestamp: ct,
cover_image: coverImage,
source_url: sourceUrl,
canonical_url: canonicalUrl,
url,
content,
images: images.slice(0, 30),
meta: {biz, mid, idx, sn}
};
}
Updated May 23, 2026 · Created May 23, 2026 · SHA-256: dec86c3694b7…