php实现的中文分词类完整实例

作者:袖梨 2022-06-24

 

代码如下:

/**
 * Dictionary-based Chinese word segmenter using backward maximum matching.
 *
 * Text is processed byte-wise: every byte with a value >= 129 is treated as
 * the lead byte of a two-byte character, and every other byte as part of a
 * single-byte (ASCII) run. This matches GBK/GB2312-style double-byte
 * encodings; the dictionary and the input are presumably expected to share
 * that encoding -- TODO confirm against the dictionary files in use.
 *
 * Usage: load() a dictionary, then call segmentString() or segmentFile().
 * Output tokens are separated by single spaces, one output line per input line.
 */
class Segmentation {

  /**
   * Behaviour switches.
   *  - 'lowercase':       lowercase the single-byte (ASCII) runs.
   *  - 'segment_english': additionally split punctuation inside ASCII runs.
   */
  var $options = array(
    'lowercase'       => true,
    'segment_english' => false,
  );

  /** Name read from the dictionary header line, or 'Unknown'. */
  var $dict_name = 'Unknown';

  /** Dictionary as a set: word => 1. */
  var $dict_words = array();

  /**
   * Enable or disable lowercasing of ASCII runs.
   *
   * @param mixed $value Truthy to enable, falsy to disable.
   * @return bool Always true.
   */
  function setLowercase($value) {
    $this->options['lowercase'] = $value ? true : false;
    return true;
  }

  /**
   * Enable or disable punctuation splitting inside ASCII runs.
   *
   * @param mixed $value Truthy to enable, falsy to disable.
   * @return bool Always true.
   */
  function setSegmentEnglish($value) {
    $this->options['segment_english'] = $value ? true : false;
    return true;
  }

  /**
   * Load a dictionary file and add its words to the in-memory dictionary.
   *
   * File format: the first line is "<type>" or "<type>\t<name>", where the
   * type must be 'DICT_WORD_W'; every following line holds one word.
   * Words accumulate across calls; the dictionary name is updated from the
   * header even when the type check subsequently fails.
   *
   * @param string $dict_file Path of the dictionary file.
   * @return bool True on success; false if the file is missing, unreadable,
   *              empty, or has an unexpected type.
   */
  function load($dict_file) {
    if (!file_exists($dict_file)) {
      return false;
    }

    $fp = fopen($dict_file, 'r');
    if ($fp === false) {
      return false;
    }

    $header = fgets($fp, 1024);
    if ($header === false) {
      fclose($fp);
      return false;
    }

    // Header fields are tab separated; the name field is optional.
    $fields    = explode("\t", trim($header));
    $dict_type = $fields[0];
    $dict_name = isset($fields[1]) ? $fields[1] : 'Unknown';

    $this->dict_name = $dict_name;

    if ($dict_type !== 'DICT_WORD_W') {
      fclose($fp);
      return false;
    }

    // Read whole lines so long entries are not split, and skip blank lines
    // (including the empty read at end of file) so they never become a
    // dictionary entry.
    while (($line = fgets($fp)) !== false) {
      $word = rtrim($line);
      if ($word !== '') {
        $this->dict_words[$word] = 1;
      }
    }

    fclose($fp);
    return true;
  }

  /**
   * @return string Name of the most recently loaded dictionary.
   */
  function getDictName() {
    return $this->dict_name;
  }

  /**
   * Segment a string, treating "\n" as the line separator.
   *
   * @param string $str Text to segment.
   * @return string|bool Segmented text, or false if no dictionary is loaded.
   */
  function segmentString($str) {
    if (count($this->dict_words) === 0) {
      return false;
    }

    $lines = explode("\n", $str);
    return $this->_segmentLines($lines);
  }

  /**
   * Segment the contents of a file line by line.
   *
   * @param string $filename Path of the file to segment.
   * @return string|bool Segmented text, or false if no dictionary is loaded
   *                     or the file cannot be read.
   */
  function segmentFile($filename) {
    if (count($this->dict_words) === 0) {
      return false;
    }

    $lines = file($filename);
    if ($lines === false) {
      return false;
    }

    return $this->_segmentLines($lines);
  }

  /**
   * Segment each line and join the results, one output line per input line.
   *
   * @param array $lines Input lines (trailing whitespace is stripped).
   * @return string Segmented text with runs of spaces collapsed to one space.
   */
  function _segmentLines($lines) {
    $contents_segmented = '';

    foreach ($lines as $line) {
      $contents_segmented .= $this->_segmentLine(rtrim($line)) . " \n";
    }

    // Collapse runs of spaces; repeat because a single pass only halves them.
    while (strpos($contents_segmented, '  ') !== false) {
      $contents_segmented = str_replace('  ', ' ', $contents_segmented);
    }

    return $contents_segmented;
  }

  /**
   * Segment one line.
   *
   * Step 1 tokenises the bytes into two-byte characters (strings) and
   * single-byte runs (one-element arrays). Step 2 walks the tokens from the
   * end of the line towards the start, each time taking the longest
   * dictionary word (2 to 4 characters) that ends at the current position,
   * or a single character when no word matches.
   *
   * @param string $str One line of text without its line terminator.
   * @return string Tokens, each preceded by a space.
   */
  function _segmentLine($str) {
    $str_final  = '';
    $str_array  = array();
    $str_length = strlen($str);

    // If the line ends on a high byte, pad it so that reading the byte after
    // a lead byte never runs past the end of the string.
    if ($str_length > 0 && ord($str[$str_length - 1]) >= 129) {
      $str .= ' ';
    }

    // Step 1: tokenise.
    $i = 0;
    while ($i < $str_length) {
      if (ord($str[$i]) >= 129) {
        // Two-byte character: lead byte plus the byte that follows it.
        $str_array[] = $str[$i] . $str[$i + 1];
        $i += 2;
        continue;
      }

      // Single-byte run: extend up to the next lead byte.
      $j = $i + 1;
      while ($j < $str_length && ord($str[$j]) < 129) {
        $j++;
      }
      $str_array[] = array(substr($str, $i, $j - $i));
      $i = $j;
    }

    // Punctuation, tab and form feed, used when splitting ASCII runs.
    $punct = '[!"#$%&\'()*+,\-.\/:;<=>?@\[\\\\\]^_`{|}~\t\f]';

    // Step 2: backward maximum matching.
    $pos = count($str_array);
    while ($pos > 0) {
      $char = $str_array[$pos - 1];

      if (is_array($char)) {
        // Single-byte run: emitted as one token, optionally split/lowercased.
        $str_final_tmp = $char[0];

        if ($this->options['segment_english']) {
          // Surround each punctuation run with spaces, then separate
          // adjacent punctuation characters from each other.
          $str_final_tmp = preg_replace('/(' . $punct . '+)/', ' $1 ', $str_final_tmp);
          $str_final_tmp = preg_replace('/(' . $punct . ')(' . $punct . ')/', ' $1 $2 ', $str_final_tmp);
        }

        if ($this->options['lowercase']) {
          $str_final_tmp = strtolower($str_final_tmp);
        }

        $str_final = ' ' . $str_final_tmp . $str_final;
        $pos--;
        continue;
      }

      // Build candidate words of 1..4 characters ending at $pos. Stop at a
      // single-byte run: it is an array and cannot be part of a word.
      $max_len    = min($pos, 4);
      $word_array = array(0 => '');
      $built      = 0;
      for ($i = 1; $i <= $max_len; $i++) {
        $piece = $str_array[$pos - $i];
        if (is_array($piece)) {
          break;
        }
        $word_array[$i] = $piece . $word_array[$i - 1];
        $built = $i;
      }

      // Prefer the longest candidate; single characters need no lookup.
      $word_found = 0;
      for ($i = $built; $i > 1; $i--) {
        if (array_key_exists($word_array[$i], $this->dict_words)) {
          $word_found = $i;
          break;
        }
      }

      if ($word_found) {
        $str_final = ' ' . $word_array[$word_found] . $str_final;
        $pos -= $word_found;
      } else {
        $str_final = ' ' . $char . $str_final;
        $pos--;
      }
    }

    return $str_final;
  }

}

?>

 

相关文章

精彩推荐