很多 API 对文本长度有限制,因此在处理长文本时,需要先将其分段。
PHP实现
算法上没有什么难度。实现的要点是按完整字符(而不是按字节)切分:一个中文字符的 UTF-8 编码通常占三个字节(部分生僻字占四个),如果按字节截断,就会把一个字符切成两半而产生乱码。此外,切分时应尽量在标点处断开,保证每一段都是完整的句子。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
/**
 * Split a UTF-8 text into chunks of at most $maxLen characters (not bytes),
 * preferring to cut right after a punctuation mark so that each chunk ends
 * on a sentence/clause boundary.
 *
 * The chunks are appended to $outputs in their original order; concatenating
 * the appended chunks reproduces $string exactly.
 *
 * @param string     $string  Text to split (expected to be valid UTF-8).
 * @param int        $maxLen  Maximum number of characters per chunk.
 * @param array|null $outputs Array the chunks are appended to (by reference).
 *                            A null value is initialised to an empty array.
 *
 * @return array The same array as $outputs, returned for convenience.
 */
function splitTextIntoArry($string, $maxLen = 90, &$outputs = array())
{
    // A caller passing an undefined variable by reference hands us null.
    if ($outputs === null) {
        $outputs = array();
    }

    // Break the text into whole UTF-8 characters. The "u" modifier makes "."
    // match a code point instead of a byte, "s" lets it match newlines too.
    // preg_match_all() returns false on malformed UTF-8; in that case the
    // text cannot be split safely, so it is passed through unchanged.
    if (preg_match_all('/./us', $string, $match) === false) {
        $outputs[] = $string;
        return $outputs;
    }
    $chars = $match[0];

    // A text that already fits into a single chunk needs no splitting.
    if (count($chars) <= $maxLen) {
        $outputs[] = $string;
        return $outputs;
    }

    // Characters after which a chunk may end: ASCII and fullwidth (CJK)
    // comma, full stop, question mark and exclamation mark.
    $breakChars = array(',', '.', '?', '!', ',', '。', '?', '!');

    $outstr = '';  // complete clauses collected for the current chunk
    $substr = '';  // trailing text since the last punctuation mark
    $cnt    = 0;   // characters in $outstr . $substr
    $subcnt = 0;   // characters in $substr

    foreach ($chars as $m) {
        $substr .= $m;
        $cnt++;
        $subcnt++;

        if (in_array($m, $breakChars, true)) {
            // The clause is complete: move it to the finished part.
            $outstr .= $substr;
            $substr = '';
            $subcnt = 0;
        }

        if ($cnt >= $maxLen) {
            if ($outstr !== '') {
                // Emit the complete clauses; the unfinished clause is
                // carried over and opens the next chunk.
                $outputs[] = $outstr;
                $outstr = '';
                $cnt = $subcnt;
            } elseif ($substr !== '') {
                // No punctuation within $maxLen characters: hard cut.
                $outputs[] = $substr;
                $substr = '';
                $subcnt = 0;
                $cnt = 0;
            }
        }
    }

    // Flush the remainder. The complete clauses precede the trailing
    // partial clause in the source text, so they must come first.
    if ($outstr !== '' || $substr !== '') {
        $outputs[] = $outstr . $substr;
    }

    return $outputs;
}
|
AD