#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

#
# Downloads the Reuters dataset and prepares it for clustering.
#
# To run: change into the mahout directory and type:
#   examples/bin/cluster-reuters.sh
#
# Optional arguments:
#   $1  number (1-5) selecting the algorithm (prompted for if omitted)
#   $2  path to a locally downloaded reuters21578.tar.gz (skips the curl)

if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
  echo "This script clusters the Reuters data set using a variety of algorithms.  The data set is downloaded automatically."
  exit
fi

# cd into the directory containing this script so relative paths resolve.
SCRIPT_PATH=${0%/*}
if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
  cd "$SCRIPT_PATH" || exit 1
fi
START_PATH=$(pwd)

# Set commands for dfs ($DFS, $DFSRM) according to the Hadoop setup.
source "${START_PATH}/set-dfs-commands.sh"

MAHOUT="../../bin/mahout"

if [ ! -e "$MAHOUT" ]; then
  echo "Can't find mahout driver in $MAHOUT, cwd $(pwd), exiting.."
  exit 1
fi

# Work area: override with MAHOUT_WORK_DIR, default to a per-user tmp dir.
if [[ -z "$MAHOUT_WORK_DIR" ]]; then
  WORK_DIR=/tmp/mahout-work-${USER}
else
  WORK_DIR=$MAHOUT_WORK_DIR
fi

algorithm=(kmeans fuzzykmeans lda streamingkmeans clean)
if [ -n "$1" ]; then
  choice=$1
else
  echo "Please select a number to choose the corresponding clustering algorithm"
  echo "1. ${algorithm[0]} clustering (runs from this example script in cluster mode only)"
  echo "2. ${algorithm[1]} clustering (may require increased heap space on yarn)"
  echo "3. ${algorithm[2]} clustering"
  echo "4. ${algorithm[3]} clustering"
  echo "5. ${algorithm[4]} -- cleans up the work area in $WORK_DIR"
  read -p "Enter your choice : " choice
fi

echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} clustering"
clustertype=${algorithm[$choice-1]}

if [ "x$clustertype" == "xclean" ]; then
  # Remove both the local and the DFS copies of the work area.
  rm -rf "$WORK_DIR"
  $DFSRM "$WORK_DIR"
  exit 1
else
  $DFS -mkdir -p "$WORK_DIR"
  mkdir -p "$WORK_DIR"
  echo "Creating work directory at ${WORK_DIR}"
fi

if [ ! -e "${WORK_DIR}/reuters-out-seqdir" ]; then
  if [ ! -e "${WORK_DIR}/reuters-out" ]; then
    if [ ! -e "${WORK_DIR}/reuters-sgm" ]; then
      if [ ! -f "${WORK_DIR}/reuters21578.tar.gz" ]; then
        if [ -n "$2" ]; then
          echo "Copying Reuters from local download"
          cp "$2" "${WORK_DIR}/reuters21578.tar.gz"
        else
          echo "Downloading Reuters-21578"
          curl http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz -o "${WORK_DIR}/reuters21578.tar.gz"
        fi
      fi
      # Make sure it was actually downloaded.
      if [ ! -f "${WORK_DIR}/reuters21578.tar.gz" ]; then
        echo "Failed to download reuters"
        exit 1
      fi
      mkdir -p "${WORK_DIR}/reuters-sgm"
      echo "Extracting..."
      tar xzf "${WORK_DIR}/reuters21578.tar.gz" -C "${WORK_DIR}/reuters-sgm"
    fi
    echo "Extracting Reuters"
    # Convert the raw SGML files into one text file per article.
    $MAHOUT org.apache.lucene.benchmark.utils.ExtractReuters \
      "${WORK_DIR}/reuters-sgm" "${WORK_DIR}/reuters-out"
    if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ]; then
      echo "Copying Reuters data to Hadoop"
      # Best-effort: the rm/mkdir may fail if the paths do not exist yet.
      set +e
      $DFSRM "${WORK_DIR}/reuters-sgm"
      $DFSRM "${WORK_DIR}/reuters-out"
      $DFS -mkdir -p "${WORK_DIR}/"
      $DFS -mkdir "${WORK_DIR}/reuters-sgm"
      $DFS -mkdir "${WORK_DIR}/reuters-out"
      $DFS -put "${WORK_DIR}/reuters-sgm" "${WORK_DIR}/reuters-sgm"
      $DFS -put "${WORK_DIR}/reuters-out" "${WORK_DIR}/reuters-out"
      set -e
    fi
  fi
  echo "Converting to Sequence Files from Directory"
  $MAHOUT seqdirectory \
    -i "${WORK_DIR}/reuters-out" \
    -o "${WORK_DIR}/reuters-out-seqdir" \
    -c UTF-8 -chunk 64 -xm sequential
fi

if [ "x$clustertype" == "xkmeans" ]; then
  $MAHOUT seq2sparse \
    -i "${WORK_DIR}/reuters-out-seqdir/" \
    -o "${WORK_DIR}/reuters-out-seqdir-sparse-kmeans" --maxDFPercent 85 --namedVector \
  && \
  $MAHOUT kmeans \
    -i "${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/tfidf-vectors/" \
    -c "${WORK_DIR}/reuters-kmeans-clusters" \
    -o "${WORK_DIR}/reuters-kmeans" \
    -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \
    -x 10 -k 20 -ow --clustering \
  && \
  $MAHOUT clusterdump \
    -i "$($DFS -ls -d "${WORK_DIR}"/reuters-kmeans/clusters-*-final | awk '{print $8}')" \
    -o "${WORK_DIR}/reuters-kmeans/clusterdump" \
    -d "${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/dictionary.file-0" \
    -dt sequencefile -b 100 -n 20 --evaluate \
    -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure -sp 0 \
    --pointsDir "${WORK_DIR}/reuters-kmeans/clusteredPoints" \
  && \
  cat "${WORK_DIR}/reuters-kmeans/clusterdump"
elif [ "x$clustertype" == "xfuzzykmeans" ]; then
  $MAHOUT seq2sparse \
    -i "${WORK_DIR}/reuters-out-seqdir/" \
    -o "${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans" --maxDFPercent 85 --namedVector \
  && \
  $MAHOUT fkmeans \
    -i "${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/tfidf-vectors/" \
    -c "${WORK_DIR}/reuters-fkmeans-clusters" \
    -o "${WORK_DIR}/reuters-fkmeans" \
    -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \
    -x 10 -k 20 -ow -m 1.1 \
  && \
  $MAHOUT clusterdump \
    -i "${WORK_DIR}"/reuters-fkmeans/clusters-*-final \
    -o "${WORK_DIR}/reuters-fkmeans/clusterdump" \
    -d "${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/dictionary.file-0" \
    -dt sequencefile -b 100 -n 20 -sp 0 \
  && \
  cat "${WORK_DIR}/reuters-fkmeans/clusterdump"
elif [ "x$clustertype" == "xlda" ]; then
  $MAHOUT seq2sparse \
    -i "${WORK_DIR}/reuters-out-seqdir/" \
    -o "${WORK_DIR}/reuters-out-seqdir-sparse-lda" -ow --maxDFPercent 85 --namedVector \
  && \
  $MAHOUT rowid \
    -i "${WORK_DIR}/reuters-out-seqdir-sparse-lda/tfidf-vectors" \
    -o "${WORK_DIR}/reuters-out-matrix" \
  && \
  rm -rf "${WORK_DIR}/reuters-lda" "${WORK_DIR}/reuters-lda-topics" "${WORK_DIR}/reuters-lda-model" \
  && \
  $MAHOUT cvb \
    -i "${WORK_DIR}/reuters-out-matrix/matrix" \
    -o "${WORK_DIR}/reuters-lda" -k 20 -ow -x 20 \
    -dict "${WORK_DIR}"/reuters-out-seqdir-sparse-lda/dictionary.file-* \
    -dt "${WORK_DIR}/reuters-lda-topics" \
    -mt "${WORK_DIR}/reuters-lda-model" \
  && \
  $MAHOUT vectordump \
    -i "${WORK_DIR}/reuters-lda-topics/part-m-00000" \
    -o "${WORK_DIR}/reuters-lda/vectordump" \
    -vs 10 -p true \
    -d "${WORK_DIR}"/reuters-out-seqdir-sparse-lda/dictionary.file-* \
    -dt sequencefile -sort "${WORK_DIR}/reuters-lda-topics/part-m-00000" \
  && \
  cat "${WORK_DIR}/reuters-lda/vectordump"
elif [ "x$clustertype" == "xstreamingkmeans" ]; then
  $MAHOUT seq2sparse \
    -i "${WORK_DIR}/reuters-out-seqdir/" \
    -o "${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans" -ow --maxDFPercent 85 --namedVector \
  && \
  rm -rf "${WORK_DIR}/reuters-streamingkmeans" \
  && \
  $MAHOUT streamingkmeans \
    -i "${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans/tfidf-vectors/" \
    --tempDir "${WORK_DIR}/tmp" \
    -o "${WORK_DIR}/reuters-streamingkmeans" \
    -sc org.apache.mahout.math.neighborhood.FastProjectionSearch \
    -dm org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure \
    -k 10 -km 100 -ow \
  && \
  $MAHOUT qualcluster \
    -i "${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans/tfidf-vectors/part-r-00000" \
    -c "${WORK_DIR}/reuters-streamingkmeans/part-r-00000" \
    -o "${WORK_DIR}/reuters-cluster-distance.csv" \
  && \
  cat "${WORK_DIR}/reuters-cluster-distance.csv"
fi
As Mahout has been upgraded, its algorithm libraries and examples have become increasingly complete. We can use the bundled script to learn how clustering works.
cluster-reuters.sh automatically downloads the Reuters data set, so we can run it directly. The script also copies the data to Hadoop, so start Hadoop before running it.
1. Extract the Reuters files
$MAHOUT org.apache.lucene.benchmark.utils.ExtractReuters ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-out
2. Convert to sequence files (easy to read and write in Hadoop)
$MAHOUT seqdirectory -i ${WORK_DIR}/reuters-out -o ${WORK_DIR}/reuters-out-seqdir -c UTF-8 -chunk 64 -xm sequential
3. Convert to vectors
For example, converting to sparse vectors:
$MAHOUT seq2sparse \
  -i ${WORK_DIR}/reuters-out-seqdir/ \
  -o ${WORK_DIR}/reuters-out-seqdir-sparse-lda -ow --maxDFPercent 85 --namedVector
4. Then select the appropriate clustering algorithm to run
I chose LDA clustering for testing. LDA (Latent Dirichlet Allocation) is a Dirichlet-based algorithm: it starts from an initial model and iteratively refines the fit until it reaches the target topic distribution.
On the distributed system, the whole run takes about 10 minutes.
That is how to run clustering on your own setup using cluster-reuters.sh.