C#使用分词算法从文本字符串中抽取关键词模拟百度搜索|CSFramework.COM巨献
C#使用分词算法从文本字符串中抽取关键词模拟百度搜索|CSFramework.COM巨献
*** 感谢网友提供的基础源码,C/S框架进行优化修订 *** 优化内容: 1. KeywordSpliter类仅开放一个公共静态方法;方便调用; 2. 修改数处bug; 3. 支持自定义关键词库; 4. 自定义关键词前置,网页的keywords前置关键词便于搜索引擎seo优化和收录。 KeywordSpliter类完整版源码: C# Code: /// <summary> /// C#使用分词算法从文本中抽取关键词|CSFramework.COM优化修订 /// </summary> public static class KeywordSpliter { #region 属性 private static string _SplitChar = " ";//分隔符 //用于移除停止词 private readonly static string[] _StopWordsList = new string[] {"的", "我们","要","自己","之","将","“","”",",","(",")","后","应","到","某","后", "个","是","位","新","一","两","在","中","或","有","更","好" }; #endregion //加载keywords_default.dic文本文件数据缓存 private static SortedList _KeywordsCacheDefault = null; //加载keywords_baidu.dic文本文件数据缓存,自定义dic文件(百度关键词,或自定义关键词) private static SortedList _KeywordsCacheBaidu = null; /// <summary> /// 得到分词关键字,以逗号隔开 /// </summary> /// <param name="keyText"></param> /// <returns></returns> public static string DoGetKeyword(string keyText) { if (String.IsNullOrEmpty(keyText)) return ""; LoadDict(); LoadDictBaidu(); #region 默认词库分词 StringBuilder sb = new StringBuilder(); ArrayList _key = SplitToList(keyText); Dictionary<string, int> distinctDict = SortByDuplicateCount(_key); foreach (KeyValuePair<string, int> pair in distinctDict) { sb.Append(pair.Key + ","); } #endregion #region 添加百度关键词,或自定义关键词 //若是单个长词关键词, 添加百度关键词,或自定义关键词 //bool baidu = _KeywordsCacheBaidu.ContainsKey(keyText); if (!distinctDict.ContainsKey(keyText) && _KeywordsCacheBaidu.ContainsKey(keyText)) { sb.Insert(0, keyText + ",");//前置关键词,seo较好 } else //枚举自定义词库 { string value; foreach (DictionaryEntry key in _KeywordsCacheBaidu) { value = key.Value.ToString(); if (keyText.IndexOf(value) >= 0 && !distinctDict.ContainsKey(value)) sb.Insert(0, value + ",");//前置关键词,seo较好 } } #endregion return sb.ToString(); } // #region 读取文本 private static SortedList LoadDictFile(string FilePath) { Encoding encoding = Encoding.GetEncoding("utf-8"); SortedList arrText = new SortedList(); // try { if (!File.Exists(FilePath)) { arrText.Add("0", "文件" + FilePath + "不存在..."); } else { StreamReader objReader = new StreamReader(FilePath, encoding); string sLine = ""; while (sLine != null) { sLine = objReader.ReadLine(); if (!String.IsNullOrEmpty(sLine)) arrText.Add(sLine, sLine); } objReader.Close(); objReader.Dispose(); } } catch (Exception ex) { } return arrText; } #endregion #region 载入词典 /// <summary> /// 加载字典文件,并缓存到变量 /// </summary> private static SortedList LoadDict() { string filePath = GetPhysicalFilePath("keywords_default.dic"); if (_KeywordsCacheDefault == null) _KeywordsCacheDefault = LoadDictFile(filePath); return _KeywordsCacheDefault; } private static SortedList LoadDictBaidu() { string filePath = GetPhysicalFilePath("keywords_baidu.dic"); if (_KeywordsCacheBaidu == null) _KeywordsCacheBaidu = LoadDictFile(filePath); return _KeywordsCacheBaidu; } /// <summary> /// 获取物理文件路径 /// </summary> /// <param name="dictFileName">文件名,如:keywords_baidu.dic</param> /// <returns></returns> private static string GetPhysicalFilePath(string dictFileName) { //判断是Web服务器环境 if (System.Web.HttpContext.Current != null) { string filePath = System.Web.HttpContext.Current.Server.MapPath("~/bin/" + dictFileName); return filePath; } else//其他环境,Winform环境 { string dir = Path.GetDirectoryName(typeof(KeywordSpliter).Assembly.Location); string filePath = Path.Combine(dir, dictFileName); return filePath; } } #endregion // #region 正则检测 private static bool IsMatch(string str, string reg) { return new Regex(reg).IsMatch(str); } #endregion // #region 首先格式化字符串(粗分) private static string FormatStr(string val) { string result = ""; if (val == null || val == "") return ""; // char[] CharList = val.ToCharArray(); // string Spc = _SplitChar;//分隔符 int StrLen = CharList.Length; int CharType = 0; //0-空白 1-英文 2-中文 3-符号 // for (int i = 0; i < StrLen; i++) { string StrList = CharList[i].ToString(); if (StrList == null || StrList == "") continue; // if (CharList[i] < 0x81) { #region if (CharList[i] < 33) { if (CharType != 0 && StrList != "\n" && StrList != "\r") { result += " "; CharType = 0; } continue; } else if (IsMatch(StrList, "[^0-9a-zA-Z@\\.%#:/\\&_-]"))//排除这些字符 { if (CharType == 0) result += StrList; else result += Spc + StrList; CharType = 3; } else { if (CharType == 2 || CharType == 3) { result += Spc + StrList; CharType = 1; } else { if (IsMatch(StrList, "[@%#:]")) { result += StrList; CharType = 3; } else { result += StrList; CharType = 1; }//end if No.4 }//end if No.3 }//end if No.2 #endregion }//if No.1 else { //如果上一个字符为非中文和非空格,则加一个空格 if (CharType != 0 && CharType != 2) result += Spc; //如果是中文标点符号 if (!IsMatch(StrList, "^[\u4e00-\u9fa5]+$")) { if (CharType != 0) result += Spc + StrList; else result += StrList; CharType = 3; } else //中文 { result += StrList; CharType = 2; } } //end if No.1 }//exit for // return result; } #endregion // #region 分词 /// <summary> /// 分词 /// </summary> /// <param name="key">关键词</param> /// <returns></returns> private static ArrayList StringSpliter(string[] key) { ArrayList List = new ArrayList(); try { SortedList dict = LoadDict();//载入词典 // for (int i = 0; i < key.Length; i++) { if (IsMatch(key[i], @"^(?!^\.$)([a-zA-Z0-9\.\u4e00-\u9fa5]+)$")) //中文、英文、数字 { if (IsMatch(key[i], "^[\u4e00-\u9fa5]+$"))//如果是纯中文 { int keyLen = key[i].Length; if (keyLen < 2) continue; else if (keyLen <= 7) List.Add(key[i]); // //开始分词 for (int x = 0; x < keyLen; x++) { //x:起始位置//y:结束位置 for (int y = x; y < keyLen; y++) { string val = key[i].Substring(x, keyLen - y); if (val == null || val.Length < 2) break; else if (val.Length > 10) continue; if (dict.Contains(val)) List.Add(val); } // } // } else if (!IsMatch(key[i], @"^(\.*)$"))//不全是小数点 { List.Add(key[i]); } } } } catch (Exception ex) { } return List; } #endregion #region 得到分词结果 /// <summary> /// 得到分词结果 /// </summary> /// <param name="keyText"></param> /// <returns></returns> private static ArrayList SplitToList(string keyText) { ArrayList KeyList = StringSpliter(FormatStr(keyText).Split(_SplitChar.ToCharArray())); //去掉没用的词 for (int i = 0; i < KeyList.Count; i++) { if (IsStopword(KeyList[i].ToString())) { KeyList.RemoveAt(i); } } return KeyList; } /// <summary> /// 把一个集合按重复次数排序 /// </summary> /// <typeparam name="T"></typeparam> /// <param name="inputList"></param> /// <returns></returns> private static Dictionary<string, int> SortByDuplicateCount(ArrayList inputList) { //用于计算每个元素出现的次数,key是元素,value是出现次数 Dictionary<string, int> distinctDict = new Dictionary<string, int>(); for (int i = 0; i < inputList.Count; i++) { //这里没用trygetvalue,会计算两次hash if (distinctDict.ContainsKey(inputList[i].ToString())) distinctDict[inputList[i].ToString()]++; else distinctDict.Add(inputList[i].ToString(), 1); } Dictionary<string, int> sortByValueDict = GetSortByValueDict(distinctDict); return sortByValueDict; } /// <summary> /// 把一个字典value的顺序排序 /// </summary> /// <typeparam name="K"></typeparam> /// <typeparam name="V"></typeparam> /// <param name="distinctDict"></param> /// <returns></returns> private static Dictionary<K, V> GetSortByValueDict<K, V>(IDictionary<K, V> distinctDict) { //用于给tempDict.Values排序的临时数组 V[] tempSortList = new V[distinctDict.Count]; distinctDict.Values.CopyTo(tempSortList, 0); Array.Sort(tempSortList); //给数据排序 Array.Reverse(tempSortList);//反转 //用于保存按value排序的字典 Dictionary<K, V> sortByValueDict = new Dictionary<K, V>(distinctDict.Count); for (int i = 0; i < tempSortList.Length; i++) { foreach (KeyValuePair<K, V> pair in distinctDict) { //比较两个泛型是否相当要用Equals,不能用==操作符 if (pair.Value.Equals(tempSortList[i]) && !sortByValueDict.ContainsKey(pair.Key)) sortByValueDict.Add(pair.Key, pair.Value); } } return sortByValueDict; } #endregion private static bool IsStopword(string str) { return _StopWordsList.Contains(str); } } //来源:C/S框架网(www.csframework.com) QQ:23404761 测试案例: C# Code: class Program { static void Main(string[] args) { //测试案例 string s0 = KeywordSpliter.DoGetKeyword("CS架构系统快速开发框架旗舰版V5.0|C/S框架网"); string s1 = KeywordSpliter.DoGetKeyword("Winform快速开发框架|C/S框架网"); string s2 = KeywordSpliter.DoGetKeyword(".NET服务端WebApi快速开发框架|C/S框架网"); string s3 = KeywordSpliter.DoGetKeyword("Web B/S架构快速开发框架|C/S框架网"); string s4 = KeywordSpliter.DoGetKeyword("测试其他:aaa bb ccccc b/s c/s 111 222 333 a.b.c 10.11222"); string s5 = KeywordSpliter.DoGetKeyword("C/S框架网 - www.csframework.com"); Console.WriteLine(s0 + "\r\n"); Console.WriteLine(s1 + "\r\n"); Console.WriteLine(s2 + "\r\n"); Console.WriteLine(s3 + "\r\n"); Console.WriteLine(s4 + "\r\n"); Console.WriteLine(s5 + "\r\n"); Console.ReadKey(); } } //来源:C/S框架网(www.csframework.com) QQ:23404761 扫一扫加微信:
参考文档:
C#日期类型转换工具(时间戳,字符串,long,byte类型互转) C#模拟百度搜索长词自动语义匹配,使用分词算法抽取关键词 模拟搜索引擎中文自动分词算法精华(CSFramework特别提供C#源码) C#推送链接URL到百度搜索资源平台提高收录量 C#使用StreamWriter在大文本文件末尾添加一行内容 模拟百度搜索渲染HTML页面关键词高亮分组排序算法(C#) CSFramework模拟百度搜索引擎自动语义分析分词算法(C#) 官网搜索引擎SEO,百度关键词SEO,搜索分词系统一体化解决方案 C#类扩展方法,字符串对象转换,常用扩展方法StringExtensions C#推送URL链接到百度搜索资源平台快速收录URL网址 C# GUID序列号转换生成唯一的16位短字符串或19位长数字序号 C# MD5字符串文本加密 C# 将GUID转换为最长16位的短字符串序号 C#判断对象类型为值类型、字符串、对象引用类型或泛型 C# 将数组、集合、可枚举类型转换成逗号分隔的字符串(String.Join)
其它资料:
什么是C/S结构? | C/S框架核心组成部分 | C/S框架-WebService部署图 | C/S框架-权限管理 | C/S结构系统框架 - 5.1旗舰版介绍 | C/S结构系统框架 - 功能介绍 | C/S结构系统框架 - 产品列表 | C/S结构系统框架 - 应用展示(图) | 三层体系架构详解 | C/S架构轻量级快速开发框架 | C/S框架网客户案例 | WebApi快速开发框架 | C/S框架代码生成器 | 用户授权注册软件系统 | 版本自动升级软件 | 数据库底层应用框架 | CSFramework.CMS内容管理系统 | |