ClueWeb09_Anchors (anchor text derived from CMU's ClueWeb09 web crawl)
Djoerd Hiemstra

ClueWeb09_Anchors (132 files)
part-00000.gz 184.07MB
part-00001.gz 186.02MB
part-00002.gz 184.80MB
part-00003.gz 187.49MB
part-00004.gz 184.18MB
part-00005.gz 185.95MB
part-00006.gz 185.23MB
part-00007.gz 185.76MB
part-00008.gz 186.03MB
part-00009.gz 185.21MB
part-00010.gz 185.05MB
part-00011.gz 184.53MB
part-00012.gz 184.93MB
part-00013.gz 184.86MB
part-00014.gz 184.68MB
part-00015.gz 184.57MB
part-00016.gz 185.33MB
part-00017.gz 185.12MB
part-00018.gz 184.79MB
part-00019.gz 185.23MB
part-00020.gz 184.97MB
part-00021.gz 185.76MB
part-00022.gz 184.90MB
part-00023.gz 185.13MB
part-00024.gz 185.58MB
part-00025.gz 185.24MB
part-00026.gz 184.78MB
part-00027.gz 184.44MB
part-00028.gz 184.90MB
part-00029.gz 184.87MB
part-00030.gz 184.06MB
part-00031.gz 184.70MB
part-00032.gz 185.82MB
part-00033.gz 185.78MB
part-00034.gz 184.62MB
part-00035.gz 186.01MB
part-00036.gz 185.09MB
part-00037.gz 184.83MB
part-00038.gz 184.95MB
part-00039.gz 184.68MB
part-00040.gz 185.74MB
part-00041.gz 185.01MB
part-00042.gz 184.82MB
part-00043.gz 185.09MB
part-00044.gz 185.53MB
part-00045.gz 185.69MB
part-00046.gz 184.27MB
part-00047.gz 185.42MB
part-00048.gz 184.82MB
part-00049.gz 186.13MB
part-00050.gz 185.25MB
part-00051.gz 185.37MB
part-00052.gz 184.64MB
part-00053.gz 184.56MB
part-00054.gz 185.00MB
part-00055.gz 185.52MB
part-00056.gz 185.02MB
part-00057.gz 185.13MB
part-00058.gz 186.07MB
part-00059.gz 186.33MB
part-00060.gz 185.10MB
part-00061.gz 184.88MB
part-00062.gz 185.02MB
part-00063.gz 185.05MB
part-00064.gz 185.74MB
part-00065.gz 184.57MB
part-00066.gz 184.90MB
part-00067.gz 186.31MB
part-00068.gz 185.41MB
part-00069.gz 184.81MB
part-00070.gz 185.01MB
part-00071.gz 184.94MB
part-00072.gz 185.78MB
part-00073.gz 190.04MB
part-00074.gz 190.62MB
part-00075.gz 184.56MB
part-00076.gz 185.18MB
part-00077.gz 185.72MB
part-00078.gz 184.81MB
part-00079.gz 184.97MB
part-00080.gz 184.96MB
part-00081.gz 184.91MB
part-00082.gz 185.47MB
part-00083.gz 185.43MB
part-00084.gz 185.11MB
part-00085.gz 184.20MB
part-00086.gz 186.81MB
part-00087.gz 184.69MB
part-00088.gz 184.78MB
part-00089.gz 184.56MB
part-00090.gz 184.63MB
part-00091.gz 185.35MB
part-00092.gz 185.67MB
part-00093.gz 185.42MB
part-00094.gz 187.29MB
part-00095.gz 184.77MB
part-00096.gz 185.00MB
part-00097.gz 185.61MB
part-00098.gz 185.13MB
part-00099.gz 185.70MB
part-00100.gz 184.54MB
part-00101.gz 185.84MB
part-00102.gz 185.18MB
part-00103.gz 185.38MB
part-00104.gz 185.67MB
part-00105.gz 185.14MB
part-00106.gz 185.94MB
part-00107.gz 184.96MB
part-00108.gz 185.00MB
part-00109.gz 184.87MB
part-00110.gz 185.16MB
part-00111.gz 184.78MB
part-00112.gz 184.95MB
part-00113.gz 184.99MB
part-00114.gz 184.62MB
part-00115.gz 185.54MB
part-00116.gz 184.43MB
part-00117.gz 185.33MB
part-00118.gz 185.17MB
part-00119.gz 185.58MB
part-00120.gz 184.24MB
part-00121.gz 185.53MB
part-00122.gz 184.74MB
part-00123.gz 187.35MB
part-00124.gz 184.85MB
part-00125.gz 185.40MB
part-00126.gz 184.97MB
part-00127.gz 185.27MB
part-00128.gz 185.23MB
part-00129.gz 185.22MB
part-00130.gz 184.87MB
part-00131.gz 185.91MB
Type: Dataset
Tags: web, ClueWeb, HTML, CMU, Twente, anchors, TREC

title= {ClueWeb09_Anchors (anchor text derived from CMU's ClueWeb09 web crawl)},
journal= {Technical Report TR-CTIT-10-15, Centre for Telematics and Information Technology, University of Twente, Enschede. ISSN 1381-3625},
author= {Djoerd Hiemstra},
year= {2010},
url= {},
license= {},
abstract= {Anchor texts extracted from ClueWeb09},
keywords= {web, ClueWeb, HTML, CMU, Twente, anchors, TREC},
terms= {},
superseded= {}