<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Do not modify this file directly. Instead, copy entries that you -->
<!-- wish to modify from this file into hadoop-site.xml and change them -->
<!-- there. If hadoop-site.xml does not already exist, create it. -->
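<!-- Illustrative example (not part of the defaults): a minimal
     hadoop-site.xml that overrides a single property, as described in the
     note above, might look like this:

     <?xml version="1.0"?>
     <configuration>
       <property>
         <name>dfs.replication</name>
         <value>2</value>
       </property>
     </configuration>
-->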
<configuration>

<!-- logging properties -->

<property>
  <name>hadoop.logfile.size</name>
  <value>10000000</value>
  <description>The max size of each log file.</description>
</property>

<property>
  <name>hadoop.logfile.count</name>
  <value>10</value>
  <description>The max number of log files.</description>
</property>

<property>
  <name>dfs.namenode.logging.level</name>
  <value>info</value>
  <description>The logging level for dfs namenode. Other values are
  "dir" (trace namespace mutations), "block" (trace block under/over
  replications and block creations/deletions), or "all".</description>
</property>
<!-- i/o properties -->

<property>
  <name>io.sort.factor</name>
  <value>10</value>
  <description>The number of streams to merge at once while sorting
  files. This determines the number of open file handles.</description>
</property>

<property>
  <name>io.sort.mb</name>
  <value>100</value>
  <description>The total amount of buffer memory to use while sorting
  files, in megabytes. By default, gives each merge stream 1MB, which
  should minimize seeks.</description>
</property>

<property>
  <name>io.file.buffer.size</name>
  <value>4096</value>
  <description>The size of buffer for use in sequence files.
  The size of this buffer should probably be a multiple of hardware
  page size (4096 on Intel x86), and it determines how much data is
  buffered during read and write operations.</description>
</property>
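<!-- Illustrative override (the value is an assumption, not a tuned
     recommendation): on a system with 4096-byte pages, a larger multiple
     such as 65536 keeps reads and writes page-aligned:

     <property>
       <name>io.file.buffer.size</name>
       <value>65536</value>
     </property>
-->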
<property>
  <name>io.bytes.per.checksum</name>
  <value>512</value>
  <description>The number of bytes per checksum. Must not be larger than
  io.file.buffer.size.</description>
</property>

<property>
  <name>io.skip.checksum.errors</name>
  <value>false</value>
  <description>If true, when a checksum error is encountered while
  reading a sequence file, entries are skipped, instead of throwing an
  exception.</description>
</property>

<property>
  <name>io.map.index.skip</name>
  <value>0</value>
  <description>Number of index entries to skip between each entry.
  Zero by default. Setting this to values larger than zero can
  facilitate opening large map files using less memory.</description>
</property>
<!-- file system properties -->

<property>
  <name>fs.default.name</name>
  <value>local</value>
  <description>The name of the default file system. Either the
  literal string "local" or a host:port for DFS.</description>
</property>
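<!-- Illustrative override (hostname and port are hypothetical): to use DFS
     rather than the local filesystem, set a namenode host:port in
     hadoop-site.xml, e.g.:

     <property>
       <name>fs.default.name</name>
       <value>namenode.example.com:9000</value>
     </property>
-->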
<property>
  <name>dfs.datanode.port</name>
  <value>50010</value>
  <description>The port number that the dfs datanode server uses as a
  starting point to look for a free port to listen on.
  </description>
</property>

<property>
  <name>dfs.info.port</name>
  <value>50070</value>
  <description>The base port number for the dfs namenode web ui.
  </description>
</property>

<property>
  <name>dfs.datanode.du.reserved</name>
  <value>0</value>
  <description>Reserved space in bytes. Always leave this much space
  free for non-dfs use.
  </description>
</property>

<property>
  <name>dfs.datanode.du.pct</name>
  <value>0.98f</value>
  <description>When calculating remaining space, only use this
  percentage of the real available space.
  </description>
</property>
<property>
  <name>dfs.name.dir</name>
  <value>/tmp/hadoop/dfs/name</value>
  <description>Determines where on the local filesystem the DFS name node
  should store the name table.</description>
</property>

<property>
  <name>dfs.data.dir</name>
  <value>/tmp/hadoop/dfs/data</value>
  <description>Determines where on the local filesystem a DFS data node
  should store its blocks. If this is a comma-delimited
  list of directories, then data will be stored in all named
  directories, typically on different devices.
  Directories that do not exist are ignored.
  </description>
</property>
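<!-- Illustrative override (the paths are hypothetical): spreading blocks
     across two drives with a comma-delimited list:

     <property>
       <name>dfs.data.dir</name>
       <value>/disk1/hadoop/dfs/data,/disk2/hadoop/dfs/data</value>
     </property>
-->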
<property>
  <name>dfs.replication</name>
  <value>3</value>
  <description>Default block replication.
  The actual number of replications can be specified when the file is
  created. The default is used if replication is not specified at create
  time.
  </description>
</property>

<property>
  <name>dfs.replication.max</name>
  <value>512</value>
  <description>Maximum block replication.
  </description>
</property>

<property>
  <name>dfs.replication.min</name>
  <value>1</value>
  <description>Minimum block replication.
  </description>
</property>

<property>
  <name>dfs.block.size</name>
  <value>67108864</value>
  <description>The default block size for new files.</description>
</property>
<property>
  <name>dfs.df.interval</name>
  <value>3000</value>
  <description>Disk usage statistics refresh interval, in milliseconds.</description>
</property>

<property>
  <name>dfs.client.block.write.retries</name>
  <value>3</value>
  <description>The number of retries for writing blocks to the data nodes,
  before we signal failure to the application.
  </description>
</property>
<!-- map/reduce properties -->

<property>
  <name>mapred.job.tracker</name>
  <value>local</value>
  <description>The host and port that the MapReduce job tracker runs
  at. If "local", then jobs are run in-process as a single map
  and reduce task.
  </description>
</property>
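<!-- Illustrative override (hostname and port are hypothetical): to run jobs
     on a cluster rather than in-process, point this at the job tracker:

     <property>
       <name>mapred.job.tracker</name>
       <value>jobtracker.example.com:9001</value>
     </property>
-->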
<property>
  <name>mapred.job.tracker.info.port</name>
  <value>50030</value>
  <description>The port that the MapReduce job tracker info webserver
  runs at.
  </description>
</property>

<property>
  <name>mapred.task.tracker.output.port</name>
  <value>50040</value>
  <description>The port number that the MapReduce task tracker output
  server uses as a starting point to look for a free port to listen on.
  </description>
</property>

<property>
  <name>mapred.task.tracker.report.port</name>
  <value>50050</value>
  <description>The port number that the MapReduce task tracker report
  server uses as a starting point to look for a free port to listen on.
  </description>
</property>

<property>
  <name>mapred.local.dir</name>
  <value>/tmp/hadoop/mapred/local</value>
  <description>The local directory where MapReduce stores intermediate
  data files. May be a comma-separated list of
  directories on different devices in order to spread disk i/o.
  Directories that do not exist are ignored.
  </description>
</property>

<property>
  <name>mapred.system.dir</name>
  <value>/tmp/hadoop/mapred/system</value>
  <description>The shared directory where MapReduce stores control files.
  </description>
</property>

<property>
  <name>mapred.temp.dir</name>
  <value>/tmp/hadoop/mapred/temp</value>
  <description>A shared directory for temporary files.
  </description>
</property>

<property>
  <name>mapred.local.dir.minspacestart</name>
  <value>0</value>
  <description>If the space in mapred.local.dir drops below this,
  do not ask for more tasks.
  Value in bytes.
  </description>
</property>

<property>
  <name>mapred.local.dir.minspacekill</name>
  <value>0</value>
  <description>If the space in mapred.local.dir drops below this,
  do not ask for more tasks until all the current ones have finished and
  cleaned up. Also, to save the rest of the running tasks, kill one of
  them to free up some space. Start with the reduce tasks, then with the
  ones that have progressed the least.
  Value in bytes.
  </description>
</property>

<property>
  <name>mapred.map.tasks</name>
  <value>2</value>
  <description>The default number of map tasks per job. Typically set
  to a prime several times greater than the number of available hosts.
  Ignored when mapred.job.tracker is "local".
  </description>
</property>
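<!-- For example (the numbers are illustrative, not a measured
     recommendation): a cluster of 20 hosts might use 97, a prime roughly
     five times the host count:

     <property>
       <name>mapred.map.tasks</name>
       <value>97</value>
     </property>
-->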
<property>
  <name>mapred.reduce.tasks</name>
  <value>1</value>
  <description>The default number of reduce tasks per job. Typically set
  to a prime close to the number of available hosts. Ignored when
  mapred.job.tracker is "local".
  </description>
</property>

<property>
  <name>mapred.reduce.parallel.copies</name>
  <value>5</value>
  <description>The default number of parallel transfers run by reduce
  during the copy (shuffle) phase.
  </description>
</property>

<property>
  <name>mapred.task.timeout</name>
  <value>600000</value>
  <description>The number of milliseconds before a task will be
  terminated if it neither reads an input, writes an output, nor
  updates its status string.
  </description>
</property>

<property>
  <name>mapred.tasktracker.tasks.maximum</name>
  <value>2</value>
  <description>The maximum number of tasks that will be run
  simultaneously by a task tracker.
  </description>
</property>

<property>
  <name>mapred.child.java.opts</name>
  <value>-Xmx200m</value>
  <description>Java opts for the task tracker child processes. Subsumes
  'mapred.child.heap.size' (if a mapred.child.heap.size value is found
  in a configuration, its maximum heap size will be used and a warning
  emitted that heap.size has been deprecated). Also, the following
  symbols, if present, will be interpolated: @taskid@ is replaced by the
  current TaskID, and @port@ is replaced by
  mapred.task.tracker.report.port + 1 (a second child will fail with a
  port-in-use error if mapred.tasktracker.tasks.maximum is greater than
  one). Any other occurrences of '@' will go unchanged. For example, to
  enable verbose gc logging to a file named for the taskid in /tmp and
  to set the maximum heap to one gigabyte, pass a 'value' of:
  -Xmx1024m -verbose:gc -Xloggc:/tmp/@taskid@.gc
  </description>
</property>
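<!-- Illustrative override, using the example from the description above: a
     one-gigabyte heap with verbose gc logging to a per-task file in /tmp.
     @taskid@ is interpolated at task launch:

     <property>
       <name>mapred.child.java.opts</name>
       <value>-Xmx1024m -verbose:gc -Xloggc:/tmp/@taskid@.gc</value>
     </property>
-->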
<property>
  <name>mapred.combine.buffer.size</name>
  <value>100000</value>
  <description>The number of entries the combining collector caches before
  combining them and writing to disk.</description>
</property>

<property>
  <name>mapred.speculative.execution</name>
  <value>true</value>
  <description>If true, then multiple instances of some map tasks may
  be executed in parallel.</description>
</property>

<property>
  <name>mapred.min.split.size</name>
  <value>0</value>
  <description>The minimum size chunk that map input should be split
  into. Note that some file formats may have minimum split sizes that
  take priority over this setting.</description>
</property>

<property>
  <name>mapred.submit.replication</name>
  <value>10</value>
  <description>The replication level for submitted job files. This
  should be around the square root of the number of nodes.
  </description>
</property>
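<!-- For example: a 100-node cluster would keep the default of 10, since
     sqrt(100) = 10; a 25-node cluster might lower it to 5. -->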
<property>
  <name>tasktracker.http.threads</name>
  <value>40</value>
  <description>The number of worker threads for the http server. This is
  used for map output fetching.
  </description>
</property>

<property>
  <name>tasktracker.http.port</name>
  <value>50060</value>
  <description>The default port for task trackers to use as their http
  server.
  </description>
</property>
<!-- ipc properties -->

<property>
  <name>ipc.client.timeout</name>
  <value>60000</value>
  <description>Defines the timeout for IPC calls in milliseconds.</description>
</property>

<property>
  <name>ipc.client.idlethreshold</name>
  <value>4000</value>
  <description>Defines the threshold number of connections after which
  connections will be inspected for idleness.
  </description>
</property>

<property>
  <name>ipc.client.maxidletime</name>
  <value>120000</value>
  <description>Defines the maximum idle time for a connected client after
  which it may be disconnected.
  </description>
</property>

<property>
  <name>ipc.client.kill.max</name>
  <value>10</value>
  <description>Defines the maximum number of clients to disconnect in one go.
  </description>
</property>

</configuration>