配置algolia
_1、注册账号
首先需要去 Algolia 官网注册自己的账号,可以直接使用 GitHub 账号或者邮箱注册登录。
新账号会自动创建一个 Application,也可以自己创建一个新的。
点击确定后,在下一个页面继续点击 `create Application` 按钮,然后点击 `NEXT`,就创建了一个新的 Application。
创建完成后点击 `Skip for now`,否则会根据域名自动生成 index 名称,自动生成的爬虫配置也不正确。
_2、新建index
按下图步骤创建index
_3、验证域名
按要求验证域名,然后点击 `Skip for now` 进行下一步骤。
_4、配置爬虫
按下图所示新建爬虫
新建后点击爬虫名称进入爬虫配置
爬虫配置如下,将域名和 `indexName` 改成自己配置的,配置完成后点击右上角 `start Crawling` 开始爬取网站内容。
_1、只爬取正文
// Algolia Crawler configuration that indexes only the page body text
// (headings, paragraphs and list items inside `.content`).
// `appId` / `apiKey` are left blank here; fill in your own values, and replace
// the site URL and the index name ("blog") with the ones you configured.
new Crawler({
  appId: "",
  apiKey: "",
  indexPrefix: "",
  rateLimit: 8,
  startUrls: ["https://vp.xiaoying.org.cn/"],
  renderJavaScript: false,
  sitemaps: ["https://vp.xiaoying.org.cn/sitemap.xml"],
  exclusionPatterns: [],
  ignoreCanonicalTo: false,
  discoveryPatterns: ["https://vp.xiaoying.org.cn/**"],
  schedule: "on the first day of the month",
  actions: [
    {
      indexName: "blog",
      pathsToMatch: ["https://vp.xiaoying.org.cn/**"],
      /**
       * Builds the records for one crawled page by delegating entirely to
       * the DocSearch helper.
       *
       * @param {object} ctx - Extraction context supplied by the crawler.
       * @param {Function} ctx.$ - DOM query function (unused here).
       * @param {object} ctx.helpers - Crawler helpers; only `docsearch` is used.
       * @returns {Array} Whatever `helpers.docsearch` returns.
       */
      recordExtractor: ({ $, helpers }) => {
        return helpers.docsearch({
          recordProps: {
            lvl1: ".content h1",
            content: ".content p, .content li",
            lvl0: {
              selectors: "section.has-active div h2",
              defaultValue: "Documentation",
            },
            lvl2: ".content h2",
            lvl3: ".content h3",
            lvl4: ".content h4",
            lvl5: ".content h5",
          },
          indexHeadings: true,
        });
      },
    },
  ],
  // The key below must match `indexName` above.
  initialIndexSettings: {
    blog: {
      attributesForFaceting: ["type", "lang"],
      attributesToRetrieve: ["hierarchy", "content", "anchor", "url"],
      attributesToHighlight: ["hierarchy", "hierarchy_camel", "content"],
      attributesToSnippet: ["content:10"],
      camelCaseAttributes: ["hierarchy", "hierarchy_radio", "content"],
      // Kept in sync with the other crawler configurations in this guide:
      // the `hierarchy` / `hierarchy_camel` levels are listed after the
      // `hierarchy_radio` ones so that heading text stored in `hierarchy`
      // is searchable too, not only the `hierarchy_radio` values.
      searchableAttributes: [
        "unordered(hierarchy_radio_camel.lvl0)",
        "unordered(hierarchy_radio.lvl0)",
        "unordered(hierarchy_radio_camel.lvl1)",
        "unordered(hierarchy_radio.lvl1)",
        "unordered(hierarchy_radio_camel.lvl2)",
        "unordered(hierarchy_radio.lvl2)",
        "unordered(hierarchy_radio_camel.lvl3)",
        "unordered(hierarchy_radio.lvl3)",
        "unordered(hierarchy_radio_camel.lvl4)",
        "unordered(hierarchy_radio.lvl4)",
        "unordered(hierarchy_radio_camel.lvl5)",
        "unordered(hierarchy_radio.lvl5)",
        "unordered(hierarchy_radio_camel.lvl6)",
        "unordered(hierarchy_radio.lvl6)",
        "unordered(hierarchy_camel.lvl0)",
        "unordered(hierarchy.lvl0)",
        "unordered(hierarchy_camel.lvl1)",
        "unordered(hierarchy.lvl1)",
        "unordered(hierarchy_camel.lvl2)",
        "unordered(hierarchy.lvl2)",
        "unordered(hierarchy_camel.lvl3)",
        "unordered(hierarchy.lvl3)",
        "unordered(hierarchy_camel.lvl4)",
        "unordered(hierarchy.lvl4)",
        "unordered(hierarchy_camel.lvl5)",
        "unordered(hierarchy.lvl5)",
        "unordered(hierarchy_camel.lvl6)",
        "unordered(hierarchy.lvl6)",
        "content",
      ],
      distinct: true,
      attributeForDistinct: "url",
      customRanking: [
        "desc(weight.pageRank)",
        "desc(weight.level)",
        "asc(weight.position)",
      ],
      ranking: [
        "words",
        "filters",
        "typo",
        "attribute",
        "proximity",
        "exact",
        "custom",
      ],
      highlightPreTag: '<span class="algolia-highlight">',
      highlightPostTag: "</span>",
      minWordSizefor1Typo: 3,
      minWordSizefor2Typos: 7,
      allowTyposOnNumericTokens: false,
      minProximity: 1,
      ignorePlurals: true,
      advancedSyntax: true,
      attributeCriteriaComputedByMinProximity: true,
      removeWordsIfNoResults: "allOptional",
    },
  },
});
_2、同时爬取代码块
正文和代码块分开
// Algolia Crawler configuration that indexes body text AND code blocks,
// storing the cleaned code in a separate `code` attribute on every record.
// `appId` / `apiKey` are left blank here; fill in your own values, and replace
// the site URL and the index name ("blog") with the ones you configured.
new Crawler({
  appId: "",
  apiKey: "",
  rateLimit: 8,
  startUrls: ["https://vp.xiaoying.org.cn/"],
  renderJavaScript: false,
  sitemaps: ["https://vp.xiaoying.org.cn/sitemap.xml"],
  exclusionPatterns: ["https://vp.xiaoying.org.cn/config/theme"],
  ignoreCanonicalTo: false,
  discoveryPatterns: ["https://vp.xiaoying.org.cn/**"],
  schedule: "on the first day of the month",
  actions: [
    {
      indexName: "blog",
      pathsToMatch: ["https://vp.xiaoying.org.cn/**"],
      /**
       * Builds the records for one crawled page.
       *
       * Each record returned by `helpers.docsearch` is extended with:
       *  - `content`: the record text with `&`, `<`, `>` HTML-escaped;
       *  - `code`: all short (< 500 chars) code snippets of the page,
       *    comment-stripped, whitespace-collapsed, joined by newlines;
       *  - `anchor` / `url`: the id of the h2/h3 the record belongs to and
       *    the page URL with that id as fragment.
       *
       * @param {object} ctx - Extraction context supplied by the crawler.
       * @param {string} ctx.url - URL of the page being extracted.
       * @param {Function} ctx.$ - DOM query function for the page.
       * @param {object} ctx.helpers - Crawler helpers (`docsearch`, `codeSnippets`).
       * @returns {Array<object>} Records to index; empty when the page has no `.content`.
       */
      recordExtractor: ({ url, $, helpers }) => {
        const $content = $(".content");
        if (!$content.length) return [];

        // === 1) Number the relevant elements in document order ===
        // The number is stored on the element itself (`data-order`) and is
        // always read back from there. A text-derived lookup key is NOT
        // used: two elements with the same tag, leading text and parent
        // class would share a key and end up with the wrong order.
        let nextOrder = 1;
        $content.find("h2, h3, p, li, pre").each((_, elem) => {
          $(elem).attr("data-order", String(nextOrder));
          nextOrder++;
        });
        const readOrder = ($elem) =>
          Number.parseInt($elem.attr("data-order") || "", 10) || null;

        // === 2) Map every h2/h3 that has an id to the order range it covers ===
        const titleElements = [];
        $content.find("h2[id], h3[id]").each((_, elem) => {
          const $title = $(elem);
          const id = ($title.attr("id") || "").trim();
          const order = readOrder($title);
          if (id && order) titleElements.push({ id, order });
        });
        titleElements.sort((a, b) => a.order - b.order);
        const titleRanges = titleElements.map((title, i) => {
          const next = titleElements[i + 1];
          return {
            id: title.id,
            startOrder: title.order,
            // The last heading covers everything up to the end of the page.
            endOrder: next ? next.order - 1 : nextOrder,
          };
        });
        const firstTitleId = titleRanges.length ? titleRanges[0].id : "";
        // Elements located before the first heading fall back to the first heading.
        const getAnchorByOrder = (order) => {
          const range = titleRanges.find(
            (r) => order >= r.startOrder && order <= r.endOrder,
          );
          return range ? range.id : firstTitleId;
        };

        // "first 80 chars of <pre> text" -> order, used to map a snippet
        // returned by helpers.codeSnippets back to its position in the page.
        const prePrefixToOrder = new Map();
        $content.find("pre").each((_, pre) => {
          const $pre = $(pre);
          const order = readOrder($pre);
          if (order) {
            prePrefixToOrder.set(($pre.text() || "").substring(0, 80), order);
          }
        });

        // === 3) Extract and clean code snippets ===
        // NOTE: `anchor` and `codeUrl` are attached to each snippet, but only
        // `content` is read further down when the records are built.
        const codeSnippets = helpers
          .codeSnippets({
            tag: "pre",
            languageClassPrefix: "language-",
          })
          .filter((s) => s.content.length < 500)
          .map((s) => {
            const originalContent = s.content;
            const codePreview = originalContent
              .replace(/\/\*[\s\S]*?\*\//g, "") // block comments
              .replace(/<\([\s\S]*?\)/g, "") // Bash <( ... )
              .replace(/\/\/[^\n\r]*/g, "") // `//` line comments
              .replace(/#[^\n\r]*/g, "") // `#` line comments
              .replace(/\s+/g, " ")
              .trim();
            const order =
              prePrefixToOrder.get(originalContent.substring(0, 80)) || null;
            const anchor = order ? getAnchorByOrder(order) : firstTitleId;
            return {
              ...s,
              content: codePreview,
              anchor,
              codeUrl: anchor ? `${url}#${anchor}` : url,
            };
          });

        // === 4) Regular documentation records ===
        const docRecords = helpers.docsearch({
          recordProps: {
            lvl1: ".content h1",
            content: ".content p, .content li",
            lvl0: {
              selectors: "section.has-active div h2",
              defaultValue: "Documentation",
            },
            lvl2: ".content h2",
            lvl3: ".content h3",
            lvl4: ".content h4",
            lvl5: ".content h5",
          },
          indexHeadings: true,
        });
        // Surface hierarchy.lvl2 / hierarchy.lvl3 as top-level fields. Both
        // are copied independently, so lvl3 is still available as an anchor
        // fallback when lvl2 is present but cannot be resolved to an id.
        const processedRecords = docRecords.map((record) => {
          const hierarchy = record.hierarchy || {};
          const patch = {};
          if (!record.lvl2 && hierarchy.lvl2) patch.lvl2 = hierarchy.lvl2;
          if (!record.lvl3 && hierarchy.lvl3) patch.lvl3 = hierarchy.lvl3;
          return { ...record, ...patch };
        });

        // "heading text" -> id; the first heading with a given text wins.
        const headingIdByText = new Map();
        $content.find("h2[id], h3[id]").each((_, h) => {
          const $h = $(h);
          const text = $h.text().trim();
          if (!headingIdByText.has(text)) {
            headingIdByText.set(text, $h.attr("id"));
          }
        });
        const findIdByExactText = (txt) =>
          headingIdByText.get((txt || "").trim()) || "";

        const cleanContent = (raw) => {
          if (!raw) return "";
          return raw
            .replace(/<[^\w\/]+>/g, "") // drop tag-like junk such as "<>"
            // Escape HTML-significant characters. The ampersand must be
            // handled first so the entities produced for "<" and ">" are
            // not escaped a second time.
            .replace(/&/g, "&amp;")
            .replace(/</g, "&lt;")
            .replace(/>/g, "&gt;");
        };

        // The code text is identical for every record of the page: build it once.
        const code = cleanContent(
          codeSnippets.map((c) => c.content).join("\n"),
        );

        return processedRecords.map((record) => {
          let anchor = "";
          if (record.lvl2) anchor = findIdByExactText(record.lvl2);
          if (!anchor && record.lvl3) anchor = findIdByExactText(record.lvl3);
          // Last resort: locate the first element containing the record text
          // and use the heading range that element falls into.
          if (!anchor && record.content) {
            let contentOrder = -1;
            $content.find("p, li, pre").each((_, el) => {
              const $el = $(el);
              if (($el.text() || "").includes(record.content)) {
                contentOrder = readOrder($el) || -1;
                return false; // stop at the first hit
              }
              return undefined;
            });
            anchor =
              contentOrder !== -1 ? getAnchorByOrder(contentOrder) : firstTitleId;
          }
          return {
            ...record,
            content: cleanContent(record.content),
            code,
            anchor,
            url: anchor ? `${url}#${anchor}` : url,
          };
        });
      },
    },
  ],
  // The key below must match `indexName` above.
  initialIndexSettings: {
    blog: {
      attributesForFaceting: ["type", "lang"],
      attributesToRetrieve: ["hierarchy", "content", "anchor", "url", "code"],
      attributesToHighlight: [
        "hierarchy",
        "hierarchy_camel",
        "content",
        "code",
      ],
      attributesToSnippet: ["content:10", "code:10"],
      camelCaseAttributes: ["hierarchy", "hierarchy_radio", "content", "code"],
      searchableAttributes: [
        "unordered(hierarchy_radio_camel.lvl0)",
        "unordered(hierarchy_radio.lvl0)",
        "unordered(hierarchy_radio_camel.lvl1)",
        "unordered(hierarchy_radio.lvl1)",
        "unordered(hierarchy_radio_camel.lvl2)",
        "unordered(hierarchy_radio.lvl2)",
        "unordered(hierarchy_radio_camel.lvl3)",
        "unordered(hierarchy_radio.lvl3)",
        "unordered(hierarchy_radio_camel.lvl4)",
        "unordered(hierarchy_radio.lvl4)",
        "unordered(hierarchy_radio_camel.lvl5)",
        "unordered(hierarchy_radio.lvl5)",
        "unordered(hierarchy_radio_camel.lvl6)",
        "unordered(hierarchy_radio.lvl6)",
        "unordered(hierarchy_camel.lvl0)",
        "unordered(hierarchy.lvl0)",
        "unordered(hierarchy_camel.lvl1)",
        "unordered(hierarchy.lvl1)",
        "unordered(hierarchy_camel.lvl2)",
        "unordered(hierarchy.lvl2)",
        "unordered(hierarchy_camel.lvl3)",
        "unordered(hierarchy.lvl3)",
        "unordered(hierarchy_camel.lvl4)",
        "unordered(hierarchy.lvl4)",
        "unordered(hierarchy_camel.lvl5)",
        "unordered(hierarchy.lvl5)",
        "unordered(hierarchy_camel.lvl6)",
        "unordered(hierarchy.lvl6)",
        "content",
        "code",
      ],
      distinct: true,
      attributeForDistinct: "url",
      customRanking: [
        "desc(weight.pageRank)",
        "desc(weight.level)",
        "asc(weight.position)",
      ],
      ranking: [
        "words",
        "filters",
        "typo",
        "attribute",
        "proximity",
        "exact",
        "custom",
      ],
      highlightPreTag: '<span class="algolia-docsearch-suggestion--highlight">',
      highlightPostTag: "</span>",
      minWordSizefor1Typo: 3,
      minWordSizefor2Typos: 7,
      allowTyposOnNumericTokens: false,
      minProximity: 1,
      ignorePlurals: true,
      advancedSyntax: true,
      attributeCriteriaComputedByMinProximity: true,
      removeWordsIfNoResults: "allOptional",
    },
  },
});
正文和代码块合并
// Algolia Crawler configuration that indexes body text AND code blocks,
// merging the cleaned code into the `content` attribute of every record.
// `appId` / `apiKey` are left blank here; fill in your own values, and replace
// the site URL and the index name ("blog") with the ones you configured.
new Crawler({
  appId: "",
  apiKey: "",
  rateLimit: 8,
  startUrls: ["https://vp.xiaoying.org.cn/"],
  renderJavaScript: false,
  sitemaps: ["https://vp.xiaoying.org.cn/sitemap.xml"],
  exclusionPatterns: ["https://vp.xiaoying.org.cn/config/theme"],
  ignoreCanonicalTo: false,
  discoveryPatterns: ["https://vp.xiaoying.org.cn/**"],
  schedule: "on the first day of the month",
  actions: [
    {
      indexName: "blog",
      pathsToMatch: ["https://vp.xiaoying.org.cn/**"],
      /**
       * Builds the records for one crawled page.
       *
       * Each record returned by `helpers.docsearch` is extended with:
       *  - `content`: the HTML-escaped record text, followed by a blank line
       *    and all short (< 500 chars) code snippets of the page
       *    (comment-stripped, whitespace-collapsed, joined by newlines);
       *  - `anchor` / `url`: the id of the h2/h3 the record belongs to and
       *    the page URL with that id as fragment.
       *
       * @param {object} ctx - Extraction context supplied by the crawler.
       * @param {string} ctx.url - URL of the page being extracted.
       * @param {Function} ctx.$ - DOM query function for the page.
       * @param {object} ctx.helpers - Crawler helpers (`docsearch`, `codeSnippets`).
       * @returns {Array<object>} Records to index; empty when the page has no `.content`.
       */
      recordExtractor: ({ url, $, helpers }) => {
        const $content = $(".content");
        if (!$content.length) return [];

        // === 1) Number the relevant elements in document order ===
        // The number is stored on the element itself (`data-order`) and is
        // always read back from there. A text-derived lookup key is NOT
        // used: two elements with the same tag, leading text and parent
        // class would share a key and end up with the wrong order.
        let nextOrder = 1;
        $content.find("h2, h3, p, li, pre").each((_, elem) => {
          $(elem).attr("data-order", String(nextOrder));
          nextOrder++;
        });
        const readOrder = ($elem) =>
          Number.parseInt($elem.attr("data-order") || "", 10) || null;

        // === 2) Map every h2/h3 that has an id to the order range it covers ===
        const titleElements = [];
        $content.find("h2[id], h3[id]").each((_, elem) => {
          const $title = $(elem);
          const id = ($title.attr("id") || "").trim();
          const order = readOrder($title);
          if (id && order) titleElements.push({ id, order });
        });
        titleElements.sort((a, b) => a.order - b.order);
        const titleRanges = titleElements.map((title, i) => {
          const next = titleElements[i + 1];
          return {
            id: title.id,
            startOrder: title.order,
            // The last heading covers everything up to the end of the page.
            endOrder: next ? next.order - 1 : nextOrder,
          };
        });
        const firstTitleId = titleRanges.length ? titleRanges[0].id : "";
        // Elements located before the first heading fall back to the first heading.
        const getAnchorByOrder = (order) => {
          const range = titleRanges.find(
            (r) => order >= r.startOrder && order <= r.endOrder,
          );
          return range ? range.id : firstTitleId;
        };

        // === 3) Extract and clean code snippets ===
        const codeSnippets = helpers
          .codeSnippets({
            tag: "pre",
            languageClassPrefix: "language-",
          })
          .filter((s) => s.content.length < 500)
          .map((s) => ({
            ...s,
            content: s.content
              .replace(/\/\*[\s\S]*?\*\//g, "") // block comments
              .replace(/<\([\s\S]*?\)/g, "") // Bash <( ... )
              .replace(/\/\/[^\n\r]*/g, "") // `//` line comments
              .replace(/#[^\n\r]*/g, "") // `#` line comments
              .replace(/\s+/g, " ")
              .trim(),
          }));

        // === 4) Regular documentation records ===
        const docRecords = helpers.docsearch({
          recordProps: {
            lvl1: ".content h1",
            content: ".content p, .content li",
            lvl0: {
              selectors: "section.has-active div h2",
              defaultValue: "Documentation",
            },
            lvl2: ".content h2",
            lvl3: ".content h3",
            lvl4: ".content h4",
            lvl5: ".content h5",
          },
          indexHeadings: true,
        });
        // Surface hierarchy.lvl2 / hierarchy.lvl3 as top-level fields. Both
        // are copied independently, so lvl3 is still available as an anchor
        // fallback when lvl2 is present but cannot be resolved to an id.
        const processedRecords = docRecords.map((record) => {
          const hierarchy = record.hierarchy || {};
          const patch = {};
          if (!record.lvl2 && hierarchy.lvl2) patch.lvl2 = hierarchy.lvl2;
          if (!record.lvl3 && hierarchy.lvl3) patch.lvl3 = hierarchy.lvl3;
          return { ...record, ...patch };
        });

        // === 5) Lookup and cleaning helpers ===
        // "heading text" -> id; the first heading with a given text wins.
        const headingIdByText = new Map();
        $content.find("h2[id], h3[id]").each((_, h) => {
          const $h = $(h);
          const text = $h.text().trim();
          if (!headingIdByText.has(text)) {
            headingIdByText.set(text, $h.attr("id"));
          }
        });
        const findIdByExactText = (txt) =>
          headingIdByText.get((txt || "").trim()) || "";

        const cleanContent = (raw) => {
          if (!raw) return "";
          return raw
            .replace(/<[^\w\/]+>/g, "") // drop tag-like junk such as "<>"
            // Escape HTML-significant characters. The ampersand must be
            // handled first so the entities produced for "<" and ">" are
            // not escaped a second time.
            .replace(/&/g, "&amp;")
            .replace(/</g, "&lt;")
            .replace(/>/g, "&gt;");
        };

        // The code text is identical for every record of the page: build it once.
        const code = cleanContent(
          codeSnippets.map((c) => c.content).join("\n"),
        );

        // === 6) Merge body text + code ===
        return processedRecords.map((record) => {
          let anchor = "";
          if (record.lvl2) anchor = findIdByExactText(record.lvl2);
          if (!anchor && record.lvl3) anchor = findIdByExactText(record.lvl3);
          // Last resort: locate the first element containing the record text
          // and use the heading range that element falls into.
          if (!anchor && record.content) {
            let contentOrder = -1;
            $content.find("p, li, pre").each((_, el) => {
              const $el = $(el);
              if (($el.text() || "").includes(record.content)) {
                contentOrder = readOrder($el) || -1;
                return false; // stop at the first hit
              }
              return undefined;
            });
            anchor =
              contentOrder !== -1 ? getAnchorByOrder(contentOrder) : firstTitleId;
          }
          const mergedContent = [cleanContent(record.content || ""), code]
            .filter(Boolean)
            .join("\n\n");
          return {
            ...record,
            content: mergedContent,
            anchor,
            url: anchor ? `${url}#${anchor}` : url,
          };
        });
      },
    },
  ],
  // The key below must match `indexName` above.
  initialIndexSettings: {
    blog: {
      attributesForFaceting: ["type", "lang"],
      attributesToRetrieve: ["hierarchy", "content", "anchor", "url"],
      attributesToHighlight: ["hierarchy", "hierarchy_camel", "content"],
      attributesToSnippet: ["content:10"],
      camelCaseAttributes: ["hierarchy", "hierarchy_radio", "content"],
      searchableAttributes: [
        "unordered(hierarchy_radio_camel.lvl0)",
        "unordered(hierarchy_radio.lvl0)",
        "unordered(hierarchy_radio_camel.lvl1)",
        "unordered(hierarchy_radio.lvl1)",
        "unordered(hierarchy_radio_camel.lvl2)",
        "unordered(hierarchy_radio.lvl2)",
        "unordered(hierarchy_radio_camel.lvl3)",
        "unordered(hierarchy_radio.lvl3)",
        "unordered(hierarchy_radio_camel.lvl4)",
        "unordered(hierarchy_radio.lvl4)",
        "unordered(hierarchy_radio_camel.lvl5)",
        "unordered(hierarchy_radio.lvl5)",
        "unordered(hierarchy_radio_camel.lvl6)",
        "unordered(hierarchy_radio.lvl6)",
        "unordered(hierarchy_camel.lvl0)",
        "unordered(hierarchy.lvl0)",
        "unordered(hierarchy_camel.lvl1)",
        "unordered(hierarchy.lvl1)",
        "unordered(hierarchy_camel.lvl2)",
        "unordered(hierarchy.lvl2)",
        "unordered(hierarchy_camel.lvl3)",
        "unordered(hierarchy.lvl3)",
        "unordered(hierarchy_camel.lvl4)",
        "unordered(hierarchy.lvl4)",
        "unordered(hierarchy_camel.lvl5)",
        "unordered(hierarchy.lvl5)",
        "unordered(hierarchy_camel.lvl6)",
        "unordered(hierarchy.lvl6)",
        "content",
      ],
      distinct: true,
      attributeForDistinct: "url",
      customRanking: [
        "desc(weight.pageRank)",
        "desc(weight.level)",
        "asc(weight.position)",
      ],
      ranking: [
        "words",
        "filters",
        "typo",
        "attribute",
        "proximity",
        "exact",
        "custom",
      ],
      highlightPreTag: '<span class="algolia-docsearch-suggestion--highlight">',
      highlightPostTag: "</span>",
      minWordSizefor1Typo: 3,
      minWordSizefor2Typos: 7,
      allowTyposOnNumericTokens: false,
      minProximity: 1,
      ignorePlurals: true,
      advancedSyntax: true,
      attributeCriteriaComputedByMinProximity: true,
      removeWordsIfNoResults: "allOptional",
    },
  },
});
_5、索引设置
回到搜索页面看是否有数据
接着配置索引,选择要搜索的内容
接着配置 facets,这是实现高级搜索和筛选功能的核心特性之一,主要作用是帮助用户快速缩小搜索范围,提升搜索体验。这里要重点注意:`lang` 必须被选择,否则网页搜索结果为空。
_6、代码配置
按图所示找到下面的配置
// Site configuration snippet showing where the Algolia search options go.
// The bare `...` lines are placeholders for the rest of your existing
// configuration (they are not spread syntax); keep whatever you already have there.
export default defineConfig({
...
// Site language.
// NOTE(review): the guide requires the `lang` facet to be selected in the
// index, presumably because search results are filtered by this value; confirm.
lang: "zh-CN",
...
themeConfig: ({
...
// Search settings: use Algolia as the search provider.
search: {
provider: 'algolia',
options: {
// Replace the '...' values with your own Algolia application id, API key
// and the index name used in the crawler configuration.
// NOTE(review): this key is part of the site config, so presumably it
// should be a search-only key rather than an admin key; confirm.
appId: '...',
apiKey: '...',
indexName: '...'
},
}
...
})
})
最后更新于 2025-10-02 15:54