hahsler.bib

@article{hahsler:Drew2017,
  author = {Jake Drew and Michael Hahsler and Tyler Moore},
  title = {Polymorphic malware detection using sequence classification methods},
  journal = {EURASIP Journal on Information Security},
  year = {2017},
  volume = {2017},
  pages = {1--12},
  number = {1},
  month = {January},
  doi = {10.1186/s13635-017-0055-6},
  url = {http://dx.doi.org/10.1186/s13635-017-0055-6},
  abstract = {Identifying malicious software executables is made difficult by the constant adaptations introduced by miscreants  in order to evade detection by antivirus software.
   Such changes are akin to mutations in biological sequences. Recently, high-throughput methods for gene sequence classification have been developed by the bioinformatics and computational biology communities. In this paper, we apply methods designed for gene sequencing to detect malware in a manner robust to attacker adaptations. Whereas most gene classification tools are optimized for and restricted to an alphabet of four letters (nucleic acids), we have selected the {\em Strand} gene sequence classifier for malware classification. Strand's design can easily accommodate unstructured data with any alphabet, including source code or compiled machine code. To demonstrate that gene sequence classification tools are suitable for classifying malware, we apply Strand to approximately 500GB of malware data provided by the Kaggle Microsoft Malware Classification Challenge (BIG 2015) used for predicting 9 classes of polymorphic malware. Experiments show that, with minimal adaptation, the method achieves accuracy levels well above 95\% requiring only a fraction of the training times used by the winning team's method.},
  category = {other}
}
@article{hahsler:Gharibi2016,
  author = {Zahra Gharibi and Mehmet Ayvaci and Michael Hahsler and Tracy Giacoma and Robert S. Gaston and Bekir Tanriover},
  title = {Cost-Effectiveness of Antibody-Based Induction Therapy in Deceased Donor Kidney Transplantation in the United States},
  journal = {Transplantation},
  year = {2016},
  volume = {},
  pages = {},
  number = {},
  month = {},
  abstract = {Induction therapy in deceased donor kidney transplantation is
    costly, with wide discrepancy in utilization and a limited evidence base,
    particularly regarding cost-effectiveness.
    METHODS: We linked the United States Renal Data System data set 
    to Medicare claims to
    estimate cumulative costs, graft survival, and incremental 
    cost-effectiveness ratio (ICER - cost per additional year of graft 
    survival) within 3 years of
    transplantation in 19 450 deceased donor kidney transplantation recipients
    with Medicare as primary payer from 2000 to 2008. We divided the 
    study cohort into high-risk (age > 60 years, panel-reactive 
    antibody > 20\%, African
    American race, Kidney Donor Profile Index > 50\%, cold ischemia time >
    24 hours) and low-risk (not having any risk factors, comprising
    approximately 15\% of the cohort). After the elimination of
    dominated options, we estimated expected ICER among induction
    categories: no-induction, alemtuzumab, rabbit antithymocyte globulin
    (r-ATG), and interleukin-2 receptor-antagonist.
    RESULTS: No-induction was the least effective and most costly 
    option in both risk groups. Depletional antibodies (r-ATG and alemtuzumab) 
    were more cost-effective across all willingness-to-pay thresholds in 
    the low-risk group. For the high-risk group and its subcategories, 
    the ICER was very sensitive to the graft survival; overall both 
    depletional antibodies were more cost-effective, mainly at higher 
    willingness-to-pay thresholds (US \$100 000 and US \$150 000). Rabbit 
    ATG appears to achieve excellent cost-effectiveness acceptability 
    curves (80\% of the recipients) in both risk groups at US \$50 000 
    threshold (except age > 60 years). In addition, only r-ATG was 
    associated with graft survival benefit over no-induction category 
    (hazard ratio, 0.91; 95\% confidence interval, 0.84-0.99) in a 
    multivariable Cox regression analysis.
    CONCLUSIONS: Antibody-based induction appears to offer substantial 
    advantages in both cost and outcome compared with no-induction. Overall, 
    depletional induction (preferably r-ATG) appears to offer the 
    greatest benefits.},
  doi = {10.1097/TP.0000000000001310},
  url = {https://www.ncbi.nlm.nih.gov/pubmed/27379555},
  category = {healthcare}
}
@inproceedings{hahsler:Hahsler2016e,
  author = {Michael Hahsler},
  title = {Grouping Association Rules Using Lift},
  booktitle = {11th INFORMS Workshop on Data Mining and Decision Analytics (DM-DA 2016)},
  year = {2016},
  month = {November},
  pages = {},
  location = {Nashville, TN, USA},
  date = {November 12, 2016},
  publisher = {},
  editor = {C. Iyigun and R. Moghaddess and A. Oztekin},
  abstract = {
    Association rule mining is a well established and popular data mining
method for finding local dependencies between items in large transaction
databases.  However, a practical drawback of mining and efficiently using
association rules is that the set of rules returned by the mining algorithm is
typically too large to be directly used.  Clustering association rules into a
small number of meaningful groups would be valuable for experts who need to
manually inspect the rules, for visualization and as the input for other
applications.  Interestingly, clustering is not widely used as a standard
method to summarize large sets of associations. In fact it performs poorly due
to high dimensionality, the inherent extreme data sparseness and the dominant
frequent itemset structure reflected in sets of association rules.  In this
paper, we review association rule clustering methods and their shortcomings. We then
propose a simple approach based on grouping columns in a lift matrix and give
an example to illustrate its usefulness.
    },
  pdf = {http://michael.hahsler.net/research/misc/INFORMS2016_DM_clusteringAR.pdf},
  category = {association rules}
}
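
The grouping idea above, clustering the columns of a lift matrix, can be illustrated with a minimal sketch. The matrix values below are made up, and k-means is just one reasonable choice for grouping columns; this is not the paper's implementation.

import numpy as np
from sklearn.cluster import KMeans

# Hypothetical lift matrix: rows are consequent items, columns are antecedent
# itemsets, entries are the lift of the corresponding rule (1.0 if no rule).
lift = np.array([[3.1, 1.0, 2.8, 1.0],
                 [1.0, 2.5, 1.0, 2.4],
                 [2.9, 1.0, 3.0, 1.0]])

# Group antecedents (columns) with similar lift profiles.
k = 2
labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(lift.T)
for g in range(k):
    print("group", g, "-> columns", np.where(labels == g)[0])
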
@article{hahsler:Hahsler2016d,
  author = {Michael Hahsler},
  title = {An Experimental Comparison of Seriation Methods For One-Mode 
      Two-Way Data},
  journal = {European Journal of Operational Research},
  year = {2017},
  volume = {257},
  pages = {133--143},
  number = {},
  month = {February},
  abstract = {Seriation aims at finding a linear order for a set of objects to reveal
  structural information which can be used for deriving data-driven
  decisions. It presents a difficult combinatorial optimization problem with
  its roots and applications in many fields including operations research.
  This paper focuses on a popular seriation problem which tries to find an
  order for a single set of objects that optimizes a given seriation
  criterion defined on one-mode two-way data, i.e., an object-by-object
  dissimilarity matrix.  Over the years, members of different research
  communities have introduced many criteria and seriation methods for this
  problem.  It is often not clear how different seriation criteria and
  methods relate to each other and which criterion or seriation method to use
  for a given application. These methods represent tools for analytics
  and therefore are of theoretical and practical interest to the operations
  research community. The purpose of this paper is to provide a
  consistent overview of the most popular criteria and seriation methods and
  to present a comprehensive experimental study to compare their performance
  using artificial and a representative set of real-world datasets.},
  pdf = {http://michael.hahsler.net/research/misc/EJOR_seriation_2016.pdf},
  doi = {10.1016/j.ejor.2016.08.066},
  category = {optimization, seriation}
}
@article{hahsler:Shaiba2016,
  author = {Hadil Shaiba and Michael Hahsler},
  title = {A Comparison of Machine Learning Methods for Predicting Tropical Cyclone Rapid Intensification Events},
  journal = {Research Journal of Applied Sciences, Engineering and Technology},
  year = {2016},
  volume = {13},
  pages = {638--651},
  number = {8},
  abstract = {The aim of this study is to improve the intensity prediction of hurricanes by accounting for rapid
  intensification (RI) events. Modern machine learning methods offer much promise for predicting meteorological
  events. One application is providing timely and accurate predictions of tropical cyclone (TC) behavior, which is
  crucial for saving lives and reducing damage to property. Current TC track prediction models perform much better
  than intensity (wind speed) models. This is partially due to the existence of RI events. An RI event is defined as a
  sudden change in the maximum sustained wind speed of 30 knots or greater within 24 hours. Forecasting RI events
  is so important that it has been put on the National Hurricane Center top forecast priority list. The research on the
  use of machine learning methods for RI prediction is currently very limited. In this paper, we investigate the
  potential of popular machine learning methods to predict RI events. The evaluated models include support vector
  machines, logistic regression, naïve-Bayes classifiers, classification and regression trees and a wide range of
  ensemble methods including boosting and stacking. We also investigate dimensionality reduction and feature
  selection, and we address class imbalance using the Synthetic Minority Over-sampling Technique (SMOTE). The
  evaluation shows that some of the investigated models improve over the current operational Rapid Intensification
  Index model. Finally, we use RI predictions to make improved storm intensity predictions.},
  doi = {10.19026/rjaset.13.3050},
  url = {http://maxwellsci.com/jp/mspabstract.php?jid=RJASET&doi=rjaset.13.3050},
  category = {earth}
}
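
As a rough illustration of the SMOTE step mentioned in the abstract, the sketch below oversamples a synthetic, imbalanced data set with the imbalanced-learn package; the data are artificial stand-ins for RI/non-RI samples, not the predictors used in the paper.

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE

# Synthetic, highly imbalanced two-class data (the minority class stands in
# for rapid-intensification events).
X, y = make_classification(n_samples=2000, weights=[0.95, 0.05], random_state=0)
print("before:", Counter(y))

# Oversample the minority class with SMOTE.
X_res, y_res = SMOTE(random_state=0).fit_resample(X, y)
print("after: ", Counter(y_res))
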
@article{hahsler:Hahsler2016c,
  author = {Michael Hahsler and Radoslaw Karpienko},
  title = {Visualizing Association Rules in Hierarchical Groups},
  journal = {Journal of Business Economics},
  year = {2016},
  pages = {1--19},
  month = {May},
  abstract = {
  Association rule mining is one of the most popular data mining methods.
  However, mining association rules often results
  in a very large number of found rules, leaving the analyst with the task to go
  through all the rules and discover interesting ones. Sifting manually through
  large sets of rules is time consuming and strenuous. Visualization has a long
  history of making large amounts of data better accessible using techniques like
  selecting and zooming. However, most association rule visualization techniques are still falling short when it comes to a large number of rules.
  In this paper we present an integrated framework for post-processing and visualization of association rules, which allows analysts to intuitively explore and interpret highly complex scenarios.
  We demonstrate how this framework can be used to analyze large sets of association rules using the R software for statistical computing, and provide
  examples from the implementation in the R-package arulesViz.
  },
  doi = {10.1007/s11573-016-0822-8},
  pdf = {http://link.springer.com/content/pdf/10.1007%2Fs11573-016-0822-8.pdf},
  category = {association rules, visualization, marketing}
}
@inproceedings{hahsler:Drew2016,
  author = {Jake Drew and Michael Hahsler and Tyler Moore},
  title = {Polymorphic Malware Detection Using Sequence Classification Methods},
  booktitle = {International Workshop on Bio-inspired Security, Trust, Assurance and Resilience (BioSTAR 2016)},
  year = {2016},
  month = {May},
  pages = {},
  location = {San Jose, CA, USA},
  date = {May 26, 2016},
  publisher = {},
  editor = {},
  abstract = {Polymorphic malware detection is challenging due
    to the continual mutations miscreants introduce to successive
    instances of a particular virus. Such changes are akin to 
    mutations in biological sequences. Recently, high-throughput methods
    for gene sequence classification have been developed by the
    bioinformatics and computational biology communities. In this
    paper, we argue that these methods can be usefully applied to
    malware detection. Unfortunately, gene classification tools are
    usually optimized for and restricted to an alphabet of four letters
    (nucleic acids). Consequently, we have selected the Strand gene
    sequence classifier, which offers a robust classification strategy
    that can easily accommodate unstructured data with any alphabet
    including source code or compiled machine code. To demonstrate
    Strand's suitability for classifying malware, we execute it on
    approximately 500GB of malware data provided by the Kaggle
    Microsoft Malware Classification Challenge (BIG 2015) used for
    predicting 9 classes of polymorphic malware. Experiments show
    that, with minimal adaptation, the method achieves accuracy
    levels well above 95\% requiring only a fraction of the training
    times used by the winning team's method.},
  pdf = {http://michael.hahsler.net/research/misc/Biostar_2016_polymorphic-malware-detection.pdf},
  category = {other}
}
@article{hahsler:Hahsler2016b,
  author = {Michael Hahsler and Matthew Bola{\~n}os},
  title = {Clustering Data Streams Based on Shared Density Between Micro-Clusters},
  journal = {IEEE Transactions on Knowledge and Data Engineering},
  year = {2016},
  volume = {28},
  number = {6},
  pages = {1449--1461},
  month = {June},
  doi = {10.1109/TKDE.2016.2522412},
  issn = {1041-4347},
  abstract = {As more and more applications produce streaming data,
  clustering data streams has become an important technique for data and
  knowledge engineering.  A typical approach is to summarize the data
  stream in real-time with an online process into a large number of so
  called micro-clusters.  Micro-clusters represent local density estimates
  by aggregating the information of many data points in a defined area.  On
  demand, a (modified) conventional clustering algorithm is used in a
  second offline step to recluster the micro-clusters into larger final
  clusters.  For reclustering, the centers of the micro-clusters are used
  as pseudo points with the density estimates used as their weights.
  However, information about density in the area between micro-clusters is
  not preserved in the online process and reclustering is based on possibly
  inaccurate assumptions about the distribution of data within and between
  micro-clusters (e.g., uniform or Gaussian).
  This paper describes DBSTREAM, the first micro-cluster-based online
  clustering component that explicitly captures the density between
  micro-clusters via a shared density graph.  The density information in this
  graph is then exploited for reclustering based on actual density between
  adjacent micro-clusters.  We discuss the space and time complexity of
  maintaining the shared density graph.  Experiments on a wide range of
  synthetic and real data sets highlight that using shared density improves
  clustering quality over other popular data stream clustering methods which
  require the creation of a larger number of smaller micro-clusters to achieve
  comparable results.},
  pdf = {http://michael.hahsler.net/research/misc/TKDE_2016_shared_density.pdf},
  category = {stream mining}
}
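
A toy sketch of the shared-density idea: fixed-radius micro-clusters absorb points, and a point that falls into the overlap of two micro-clusters increments a shared count between them. This only conveys the concept; it is not the DBSTREAM algorithm, and all parameters and data are made up.

import numpy as np
from collections import defaultdict

r = 1.0                       # micro-cluster radius (arbitrary)
centers, weights = [], []     # micro-cluster centers and absorbed point counts
shared = defaultdict(int)     # (i, j) -> shared density count

rng = np.random.default_rng(0)
stream = np.vstack([rng.normal([0, 0], 0.8, (200, 2)),
                    rng.normal([3, 0], 0.8, (200, 2))])

for x in stream:
    hits = [i for i, c in enumerate(centers) if np.linalg.norm(x - c) <= r]
    if not hits:
        centers.append(x.copy())
        weights.append(1)
    else:
        for i in hits:
            weights[i] += 1
        for a in hits:
            for b in hits:
                if a < b:
                    shared[(a, b)] += 1   # point lies in the overlap of a and b

print(len(centers), "micro-clusters,",
      sum(v > 5 for v in shared.values()), "strongly connected pairs")
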
@article{hahsler:Hahsler2016,
  author = {Michael Hahsler and Matthew Bola{\~n}os and John Forrest},
  title = {stream: An Extensible Framework for Data Stream Clustering Research with {R}},
  journal = {Journal of Statistical Software},
  year = {2017},
  volume = {76},
  pages = {1--52},
  number = {14},
  month = {February},
  abstract = {In recent years, data streams have become an increasingly
   important area of research for the computer science, database and
   statistics communities. Data streams are ordered and potentially
   unbounded sequences of data points created by a typically
   non-stationary data generating process.  Common data mining tasks
   associated with data streams include clustering, classification and
   frequent pattern mining. New algorithms for these types of data are
   proposed regularly and it is important to evaluate them thoroughly
   under standardized conditions.
   In this paper we introduce stream, a research tool that includes modeling and
   simulating data streams as well as an extensible framework for implementing,
   interfacing and experimenting with algorithms for various data stream mining
   tasks.  The main advantage of stream is that it seamlessly integrates
   with the large existing infrastructure provided by R.  In
   addition to data handling, plotting and easy scripting capabilities,
   R also provides many existing algorithms and enables users to
   interface code written in many programming languages popular among data
   mining researchers (e.g., C/C++, Java and Python).  In this paper we
   describe the architecture of stream and focus on its use for data stream
   clustering research.  stream was implemented with extensibility in mind
   and will be extended in the future to cover additional data stream mining
   tasks like classification and frequent pattern mining.},
  issn = {1548-7660},
  nopdf = {http://michael.hahsler.net/research/misc/JSS_2016_stream.pdf},
  pdf = {https://www.jstatsoft.org/index.php/jss/article/view/v076i14/v76i14.pdf},
  doi = {10.18637/jss.v076.i14},
  category = {stream mining}
}
@inproceedings{hahsler:Mokhtarpour2016,
  author = {Becca Mokhtarpour and Jerrell T. Stracener and Michael Hahsler},
  title = {A Data-Analysis Approach For Improved Decision-Making In Selecting the Preferred {SoS} Capability Solution},
  booktitle = {2016 Conference on Systems Engineering Research},
  year = {2016},
  month = {March},
  pages = {},
  location = {Huntsville, AL, USA},
  date = {March 22--24, 2016},
  publisher = {},
  editor = {},
  abstract = {A system of systems (SoS) approach for providing a new capability has been gaining increased attention and has consequently resulted in numerous research activities. However, decision-making methods and frameworks are still lacking in this area.  In this paper, a general method is presented for selecting the preferred SoS solution within a large and complex solution space. The method presented in this paper combines statistical data analysis and ranking methods to screen a large number of feasible SoS solutions in a high-dimensional space and present the best SoS solution to the stakeholders. This method reduces the order of the solution space to a manageable level to enable communication among stakeholders. The advantage of the method is in increased participation of stakeholders by providing a smaller palette of solutions that can be effectively negotiated and compared in terms of various estimated decision factors. The effectiveness of this method is demonstrated through a hypothetical case of a search-and-rescue mission.},
  category = {other}
}
@inproceedings{hahsler:Chelluboina2015,
  author = {Chelluboina, Sudheer and Hahsler, Michael},
  title = {Trajectory Segmentation Using Oblique Envelopes},
  booktitle = {2015 IEEE International Conference on Information Reuse and Integration (IRI)},
  year = {2015},
  month = {August},
  pages = {470--475},
  location = {San Francisco, CA, USA},
  date = {August 13--15, 2015},
  publisher = {IEEE},
  editor = {},
  doi = {10.1109/IRI.2015.78},
  abstract = {Trajectory segmentation, i.e., breaking the trajectory into sub-trajectories, is a fundamental task needed for many applications dealing with moving objects. Several methods for trajectory segmentation, e.g., based on minimum description length (MDL), have been proposed. In this paper, we develop a novel technique for trajectory segmentation which creates a series of oblique envelopes to partition the trajectory into sub-trajectories. Experiments with the new algorithm on hurricane trajectory data, taxi GPS data and simulated data of tracks show that oblique envelopes outperform MDL-based trajectory segmentation.},
  category = {other}
}
@inproceedings{hahsler:Nagar2015,
  author = {Anurag Nagar and Michael Hahsler and Hisham Al-Mubaid},
  title = {Association Rule Mining of Gene Ontology Annotation Terms for {SGD}},
  booktitle = {2015 IEEE Conference on Computational Intelligence in Bioinformatics and Computational Biology (CIBCB)},
  year = {2015},
  month = {August},
  pages = {},
  location = {Niagara Falls, Canada},
  date = {August 12--15, 2015},
  publisher = {IEEE},
  editor = {},
  abstract = {Gene Ontology is one of the largest bioinformatics projects and
	seeks to consolidate knowledge about genes through the annotation of terms
	to three ontologies. In this work, we present a technique to find
	association relationships in the annotation terms for the
	Saccharomyces cerevisiae (SGD) genome. We first present a
	normalization algorithm to ensure that the annotation terms have a
	similar level of specificity. Association rule mining algorithms
	are used to find significant and non-trivial association rules in
	these normalized datasets. Metrics such as support, confidence, and
	lift can be used to evaluate the strength of found rules. We
	conducted experiments on the entire SGD annotation dataset and here
	we present the top 10 strongest rules for each of the three
	ontologies. We verify the found rules using evidence from the
	biomedical literature. The presented method has a number of
	advantages - it relies only on the structure of the gene ontology,
	has minimal memory and storage requirements, and can be easily scaled
	for large genomes, such as the human genome. There are many
	applications of this technique, such as predicting the GO
	annotations for new genes or those that have not been
	studied extensively.},
  pdf = {http://michael.hahsler.net/research/misc/CIBCI_2015_AssociationRuleMiningofGOA.pdf},
  doi = {10.1109/CIBCB.2015.7300289},
  category = {association rules, bioinformatics}
}
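
For readers unfamiliar with the measures mentioned above, here is a tiny, self-contained illustration of support, confidence, and lift; the transactions are made-up sets of GO-style terms and are unrelated to the actual SGD data.

transactions = [
    {"GO:A", "GO:B", "GO:C"},
    {"GO:A", "GO:B"},
    {"GO:A", "GO:C"},
    {"GO:B", "GO:C"},
    {"GO:A", "GO:B", "GO:C"},
]
n = len(transactions)

def support(itemset):
    # fraction of transactions containing the itemset
    return sum(itemset <= t for t in transactions) / n

# Rule {GO:A} -> {GO:B}
lhs, rhs = {"GO:A"}, {"GO:B"}
sup = support(lhs | rhs)
conf = sup / support(lhs)
lift = conf / support(rhs)
print(f"support={sup:.2f} confidence={conf:.2f} lift={lift:.2f}")
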
@inproceedings{hahsler:Lassig2015,
  author = {J{\"o}rg L{\"a}ssig and Michael Hahsler},
  title = {Cooperative Data Analysis in Supply Chains Using Selective Information Disclosure},
  booktitle = {Operations Research and Computing: Algorithms and Software for Analytics, 14th INFORMS Computing Society Conference (ICS2015)},
  year = {2015},
  month = {January},
  pages = {},
  location = {Richmond, VA},
  date = {January 11--13, 2015},
  publisher = {INFORMS},
  editor = {Brian Borchers and J. Paul Brooks and Laura McLay},
  abstract = {Many modern products (e. g., consumer electronics) consist of
	hundreds of complex parts sourced from a large number of suppliers. In
	    such a setting, finding the source of certain properties, e. g.,
	the source of defects in the final product, becomes increasingly
	    difficult. Data analysis methods can be used on information shared
	    in modern supply chains. However, some information may be
	    confidential since it touches proprietary production processes or
	    trade secrets. Important principles of confidentiality are data
	    minimization and that each participant has control over how much
	    information is communicated with others, both of which make data
	    analysis more difficult.  In this work, we investigate the
	    effectiveness of strategies for selective information disclosure in
	    order to perform cooperative data analysis in a supply chain. The
	    goal is to minimize information exchange by only exchanging
	    information which is needed for the analysis tasks at hand. The
	    work is motivated by the growing demand for cross company data
	    analysis, while simultaneously addressing confidentiality concerns.
	    As an example, we apply a newly developed protocol with association
	    mining techniques in an empirical simulation study to compare its
	    effectiveness with complete information disclosure. The results
	    show that the predictive performance is comparable while
    the amount of exchanged information is reduced significantly.},
  url = {https://www.informs.org/content/download/296126/2820277/file/ics.2015.0019-pp245-256.pdf},
  category = {association rules}
}
@inproceedings{hahsler:Drew2014b,
  author = {Jake Drew and Michael Hahsler},
  title = {Practical Applications of Locality Sensitive Hashing for Unstructured Data},
  booktitle = {Proceedings of the 2014 CMG Conference: Performance and Capacity},
  year = {2014},
  month = {November},
  pages = {},
  location = {Atlanta, GA},
  date = {November 3--6, 2014},
  publisher = {CMG},
  editor = {},
  abstract = {Working with large amounts of unstructured data (e.g., text
	documents) has become important for many business, engineering and
	scientific applications. The purpose of this article is to demonstrate
	how the practical Data Scientist can implement a Locality Sensitive
	Hashing system from start to finish in order to drastically reduce the
	time required to perform a similarity search in high dimensional space
	(e.g., created by the terms in the vector space model for documents).
	Locality Sensitive Hashing dramatically reduces the amount of data
	required for storage and comparison by applying probabilistic
	dimensionality reduction. In this paper we concentrate on the
	implementation of min-wise independent permutations (MinHashing) which
	provides an efficient way to determine an accurate approximation of the
	Jaccard similarity coefficient between sets (e.g., sets of terms in
	documents).},
  pdf = {http://michael.hahsler.net/research/misc/CMG_2014_LSH.pdf},
  category = {bioinformatics}
}
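
A minimal MinHash sketch for approximating the Jaccard similarity between two term sets, in the spirit of the technique described above. It uses salted built-in hashing instead of true min-wise independent permutations and is purely illustrative, not the system from the paper.

import random

k = 128
random.seed(42)
salts = [random.getrandbits(32) for _ in range(k)]

def minhash(terms):
    # one minimum per salted hash function (signatures must be built in the
    # same process, since Python's string hashing is seeded per run)
    return [min(hash((salt, t)) & 0xFFFFFFFF for t in terms) for salt in salts]

def estimate_jaccard(sig_a, sig_b):
    return sum(a == b for a, b in zip(sig_a, sig_b)) / len(sig_a)

a = set("the quick brown fox jumps over the lazy dog".split())
b = set("the quick brown fox sleeps under the lazy dog".split())
true_j = len(a & b) / len(a | b)
est_j = estimate_jaccard(minhash(a), minhash(b))
print(f"true Jaccard={true_j:.2f}  MinHash estimate={est_j:.2f}")
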
@inproceedings{hahsler:Drew2014,
  author = {Jake Drew and Michael Hahsler},
  title = {Strand: Fast Sequence Comparison using MapReduce and Locality Sensitive Hashing},
  booktitle = {Proceedings of the ACM Conference on Bioinformatics, Computational Biology and Health Informatics (BCB 2014)},
  year = {2014},
  month = {September},
  pages = {},
  location = {Newport Beach, CA},
  date = {September 20--23, 2014},
  publisher = {ACM},
  editor = {},
  abstract = {The Super Threaded Reference-Free Alignment-Free 
    N-sequence Decoder (Strand) is a highly parallel technique for
    the learning and classification of gene sequence data into
    any number of associated categories or gene sequence taxonomies. 
    Current methods, including the state-of-the-art
    sequence classification method RDP, balance performance
    by using a shorter word length. Strand in contrast uses a
    much longer word length, and does so efficiently by implementing 
    a Divide and Conquer algorithm leveraging MapReduce 
    style processing and locality sensitive hashing. Strand
    is able to learn gene sequence taxonomies and classify new
    sequences approximately 20 times faster than the RDP classifier 
    while still achieving comparable accuracy results. This
    paper compares the accuracy and performance characteristics 
	of Strand against RDP using 16S rRNA sequence data
    from the RDP training dataset and the Greengenes sequence
    repository.},
  pdf = {http://michael.hahsler.net/research/misc/ACM_BCB_2014_Strand.pdf},
  doi = {10.1145/2649387.2649436},
  category = {bioinformatics}
}
@inproceedings{hahsler:Shaiba2014,
  author = {Hadil Shaiba and Michael Hahsler},
  title = {An Experimental Comparison of Different
    Classifiers for Predicting Tropical Cyclone
    Rapid Intensification Events},
  booktitle = {Proceedings of the International Conference on Machine Learning, Electrical and Mechanical Engineering (ICMLEME'2014), Dubai, UAE},
  year = {2014},
  month = {January},
  pages = {},
  editor = {},
  pdf = {http://michael.hahsler.net/research/misc/IIE2014_RII.pdf},
  abstract = {Accurately predicting the intensity and track of a
    Tropical Cyclone (TC) can save the lives of people and help to
    significantly reduce damage to property and infrastructure. Current
    track prediction models outperform intensity models which is
    partially due to the existence of rapid intensification (RI) events. RI
    appears in the lifecycle of most major hurricanes and can be defined
    as a change in intensity within 24 hours which exceeds 30 knots.
    Improving the prediction of RI events has been identified as one of
    the top priority problems by the National Hurricane Center (NHC). In
    this paper we compare the RI event prediction performance of several
    popular classification methods: Logistic regression, naive Bayes
    classifier, classification and regression tree (CART), and support
    vector machine. The dataset used is derived from the data used by the
    Statistical Hurricane Intensity Prediction Scheme (SHIPS) model for
    intensity prediction which contains large-scale weather, ocean, and
    earth condition predictors from 1982 to 2011. 10-fold cross
    validation is applied to compare the models. The probability of
    detection (POD) and false alarm ratio (FAR) are used to measure
    performance. Predicting RI events is a difficult problem but initial
    experiments show potential for improving forecasts using data mining
    and machine learning techniques.},
  category = {earth}
}
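
The two verification measures named in the abstract are simple ratios computed from a contingency table; the counts below are invented purely to show the arithmetic.

# hits = RI events correctly predicted, misses = RI events not predicted,
# false_alarms = predicted RI events that did not occur (made-up counts).
hits, misses, false_alarms = 42, 18, 25

pod = hits / (hits + misses)                  # probability of detection
far = false_alarms / (hits + false_alarms)    # false alarm ratio
print(f"POD={pod:.2f}  FAR={far:.2f}")
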
@inproceedings{hahsler:NagarBCB2013,
  author = {Anurag Nagar and Michael Hahsler},
  title = {Genomic Sequence Fragment Identification using Quasi-Alignment},
  booktitle = {Proceedings of the ACM BCB Conference 2013, Washington D.C.},
  year = {2013},
  month = {September},
  pages = {},
  editor = {},
  pdf = {http://michael.hahsler.net/research/misc/ACM_BCB_2013_Fragments.pdf},
  doi = {10.1145/2506583.2506647},
  abstract = {Identification of organisms using their genetic sequences is a
	popular problem in molecular biology and is used in fields such as
	    metagenomics, molecular phylogenetics and DNA Barcoding. These
	    applications depend on searching large sequence databases for
	    individual matching sequences (e.g., with BLAST) and comparing
	    sequences using multiple sequence alignment (e.g., via Clustal),
	both of which are computationally expensive and require extensive
	    server resources. We propose a novel method for sequence
	    comparison, analysis, and classification which avoids the need to
	    align sequences at the base level or search a database for
	    similarity. Instead, our method uses alignment-free methods to
	    find probabilistic quasi-alignments for longer (typically 100 base
		    pairs) segments. Clustering is then used to create compact
	    models that can be used to analyze a set of sequences and to
	    score and classify unknown sequences against these models. In this
	    paper we expand prior work in two ways. We show how
	    quasi-alignments can be expanded into larger quasi-aligned sections
	    and we develop a method to classify short sequence fragments. The
	    latter is especially useful when working with Next-Generation
	    Sequencing (NGS) techniques that generate output in the form of
	    relatively short reads. We have conducted extensive experiments 
	    using fragments from bacterial 16S rRNA sequences obtained from the
	    Greengenes project and our results show that the new
	    quasi-alignment based approach can provide excellent results as
	    well as overcome some of the restrictions of the widely used
	    Ribosomal Database Project (RDP) classifier.},
  category = {bioinformatics}
}
@inproceedings{hahsler:Shaiba2013,
  author = {Hadil Shaiba and Michael Hahsler},
  title = {Intensity Prediction Model for Tropical
  Cyclone Rapid Intensification Events},
  booktitle = {Proceedings of the IADIS Applied Computing 2013 (AC 2013)
      Conference, Fort Worth, TX},
  year = {2013},
  month = {October},
  pages = {},
  editor = {},
  isbn = {978-989-8533-20-3},
  abstract = {Tropical Cyclones (TC) create strong wind and rain and can cause significant human and financial losses. Many major
  hurricanes in the Atlantic Ocean undergo rapid intensification (RI). RI events happen when the strength of the storm
  increases rapidly within 24 hours. Improving the hurricane’s intensity prediction model by accurately detecting the
  occurrence and predicting the intensity of an RI event can help avoid human and financial losses. In this paper we
  analyzed RI events in the Atlantic Basin and investigated the use of a combination of three different models to predict RI
  events. The first and second models are simple location and time-based models which use the conditional probability of
  intensification given the day of the year and the location of the storm. The third model uses a data mining technique
  which is based on the Extensible Markov Chain Model (EMM) which clusters the hurricane’s lifecycle into states and
  then uses the transition probabilities between these states for prediction. One of the main characteristics of a Markov
  Chain is that the next state depends only on the current state and by knowing the current state we can predict future states
  which provide us with estimates for future intensities. In future research we plan to test each model independently and
  combinations of the models by comparing them to the current best prediction models.},
  category = {earth}
}
@article{hahsler:Nagar2013,
  author = {Anurag Nagar and Michael Hahsler},
  title = {Fast discovery and visualization of conserved regions in {DNA} sequences using quasi-alignment},
  journal = {BMC Bioinformatics},
  year = {2013},
  volume = {14},
  number = {Suppl. 11},
  pages = {},
  url = {http://www.biomedcentral.com/1471-2105/14/S11/S2/abstract},
  doi = {10.1186/1471-2105-14-S11-S2},
  abstract = {
	Next Generation Sequencing techniques are producing enormous amounts of
	biological sequence data and analysis becomes a major computational
	problem. Currently, most analysis, especially the identification of
	conserved regions, relies heavily on Multiple Sequence Alignment,
	which has a polynomial run time in terms of the number of sequences.
	Often significant computational resources are required. In this
	work, we present a method to efficiently discover regions of high
	similarity across multiple sequences without performing expensive
	sequence alignment. The method is based on approximating edit
	distance between segments of sequences using p-mer frequency
	counts. Then, efficient high-throughput data stream clustering is
	used to group highly similar segments into so called
	quasi-alignments. Quasi-alignments can be used for a variety of
	tasks such as species characterization and identification,
	phylogenetic analysis, functional analysis of sequences and, as in this
	paper, for discovering conserved regions.
	
	In this paper, we show that quasi-alignments can be used to
	discover highly similar segments across multiple sequences from
	related or different genomes efficiently and accurately.
	Experiments on a large number of unaligned 16S rRNA sequences
	obtained from the Greengenes database show that the method is able
	to identify conserved regions which agree with known hypervariable
	regions in 16S rRNA. Furthermore, the experiments show that the
	proposed method scales well for large data sets with a run time
	linear in the number of sequences, whereas existing multiple
	sequence alignment methods (such as Clustal) need a superlinear
	polynomial run time.
	
	Quasi-alignment-based algorithms can detect highly
	similar regions and conserved areas across multiple
	sequences. Since the run time is linear and the
	sequences are converted into a compact clustering
	model, we are able to identify conserved regions fast
	or even interactively using a standard PC. Our method
	has many potential applications such as finding
	characteristic signature sequences for families of
	organisms and studying conserved and variable regions
	in, for example, 16S rRNA.
    },
  category = {bioinformatics, visualization}
}
@incollection{hahsler:Bolanos2012,
  author = {Matthew Bola{\~n}os and John Forrest and Michael Hahsler},
  title = {Clustering Large Datasets using Data Stream Clustering Techniques},
  booktitle = {Data Analysis, Machine Learning and Knowledge Discovery},
  year = {2014},
  pages = {135--143},
  editor = {Myra Spiliopoulou and Lars Schmidt-Thieme and Ruth Janning},
  series = {Studies in Classification, Data Analysis, and Knowledge Organization},
  publisher = {Springer-Verlag},
  abstract = {Unsupervised identification of groups in large data sets is important for many machine learning and knowledge discovery applications. Conventional clustering approaches (k-means, hierarchical clustering, etc.) typically do not scale well for very large data sets. In recent years, data stream clustering algorithms have been proposed which can deal efficiently with potentially unbounded streams of data. This paper is the first to investigate the use of data stream clustering algorithms as light-weight alternatives to conventional algorithms on large non-streaming data. We will discuss important issues including order dependence and report the results of an initial study using several synthetic and real-world data sets.},
  pdf = {http://michael.hahsler.net/research/misc/gfkl2012hahsler17.pdf},
  doi = {10.1007/978-3-319-01595-8_15},
  category = {stream mining}
}
@inproceedings{hahsler:Nagar2012b,
  author = {Anurag Nagar and Michael Hahsler},
  title = {A Novel Quasi-Alignment-Based Method for Discovering Conserved
    Regions in Genetic Sequences},
  booktitle = {Proceedings of the IEEE BIBM 2012 Workshop on Data-Mining of Next-Generation Sequencing},
  year = {2012},
  month = {October},
  pages = {},
  location = {Philadelphia, PA},
  date = {October 4--7, 2012},
  publisher = {IEEE Computer Society Press},
  editor = {},
  abstract = {
    This paper presents an alignment-free technique
	to efficiently discover similar regions in large sets of biological
	sequences using position sensitive p-mer frequency clustering.
	A set of sequences is broken down into segments and then a
	frequency distribution over all oligomers of size p (referred
		to as p-mers) is obtained to summarize each segment. These
	summaries are clustered while the order of segments in the set
	of sequences is preserved in a Markov-type model. Sequence
	segments within each cluster have very similar DNA/RNA
	patterns and form a so called quasi-alignment. This fact can
	be used for a variety of tasks such as species characterization
	and identification, phylogenetic analysis, functional analysis of
	sequences and, as in this paper, for discovering conserved
	regions. Our method is computationally more efficient than
	multiple sequence alignment since it can apply modern data
	stream clustering algorithms which run in time linear in the
	number of segments and thus can help discover highly similar
	regions across a large number of sequences efficiently. In
	this paper, we apply the approach to efficiently discover and
	visualize conserved regions in 16S rRNA.
    },
  pdf = {http://michael.hahsler.net/research/misc/BIBM_2012_ConservedRegions.pdf},
  doi = {10.1109/BIBMW.2012.6470216},
  category = {bioinformatics, visualization}
}
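
A rough sketch of the frequency-profile part of the idea: summarize fixed-length segments by p-mer frequency vectors and cluster similar segments. The sequences are random, k-means replaces the data stream clustering used in the paper, and the segment-ordering (Markov) component is omitted.

import numpy as np
from itertools import product
from sklearn.cluster import KMeans

p = 3
mers = ["".join(m) for m in product("ACGT", repeat=p)]
index = {m: i for i, m in enumerate(mers)}

def profile(segment):
    # relative p-mer frequencies of one segment
    v = np.zeros(len(mers))
    for i in range(len(segment) - p + 1):
        v[index[segment[i:i + p]]] += 1
    return v / max(v.sum(), 1)

rng = np.random.default_rng(1)
segments = ["".join(rng.choice(list("ACGT"), 100)) for _ in range(20)]
X = np.array([profile(s) for s in segments])

labels = KMeans(n_clusters=4, n_init=10, random_state=1).fit_predict(X)
print("segment cluster labels:", labels)
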
@inproceedings{hahsler:Nagar2012,
  author = {Anurag Nagar and Michael Hahsler},
  title = {Using Text and Data Mining Techniques to extract Stock Market Sentiment from Live News Streams},
  booktitle = {2012 International Conference on Computer Technology and Science (ICCTS 2012)},
  year = {2012},
  month = {August},
  pages = {},
  location = {New Delhi, India},
  date = {August 18--19, 2012},
  publisher = {},
  editor = {},
  abstract = {Analyzing stock market trends and sentiment is an interdisciplinary area of research being undertaken by
    many disciplines such as Finance, Computer Science, Statistics, and Economics. It has been well established
    that real time news plays a strong role in the movement of stock prices. With the advent of electronic and
    online news sources, analysts have to deal with enormous amounts of real-time, unstructured streaming data.
    In this paper, we present an automated text mining based approach to aggregate news stories from diverse
    sources and create a News Corpus. The Corpus is filtered down to relevant sentences and analyzed using
    Natural Language Processing (NLP) techniques. A sentiment metric, called NewsSentiment, utilizing the
    count of positive and negative polarity words is proposed as a measure of the sentiment of the overall news
    corpus. We have used various open source packages and tools to develop the news collection and aggregation
    engine as well as the sentiment evaluation engine. Extensive experimentation has been done using news
    stories about various stocks. The time variation of NewsSentiment shows a very strong correlation with the
    actual stock price movement. Our proposed metric has many applications in analyzing current news stories
    and predicting stock trends for specific companies and sectors of the economy.
    },
  pdf = {http://michael.hahsler.net/research/misc/ICCTS_2012_NewsSentimentAnalysis.pdf},
  category = {stream mining}
}
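
A toy polarity-count score in the spirit of the NewsSentiment metric described above; the word lists and sentences are invented, and the paper's pipeline (corpus collection, sentence filtering, NLP preprocessing) is not reproduced here.

positive = {"gain", "growth", "beat", "strong", "record"}
negative = {"loss", "drop", "miss", "weak", "lawsuit"}

def news_sentiment(sentences):
    words = [w for s in sentences for w in s.lower().split()]
    pos = sum(w in positive for w in words)
    neg = sum(w in negative for w in words)
    return (pos - neg) / max(pos + neg, 1)   # normalized polarity balance

print(news_sentiment(["Strong quarterly growth beats estimates",
                      "Analysts fear a drop in margins"]))
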
@inproceedings{hahsler:Isaksson2012,
  author = {Charlie Isaksson and Margaret H. Dunham and Michael Hahsler},
  title = {{SOS}tream: Self Organizing Density-Based Clustering Over Data Stream},
  booktitle = {International Conference on Machine Learning and Data Mining
	(MLDM'2012)},
  year = {2012},
  month = {July},
  pages = {264--278},
  location = {Berlin, Germany},
  date = {July 13--20, 2012},
  publisher = {Springer},
  editor = {Petra Perner},
  series = {Lecture Notes in Computer Science LNAI 7376},
  abstract = {In this paper we propose a data stream clustering algorithm,
	called Self Organizing density based clustering over data Stream
	    (SOStream).  This algorithm has several novel features.  Instead of
	    using a fixed, user defined similarity threshold or a static grid,
	SOStream detects structure within fast evolving data streams by
	    automatically adapting the threshold for density-based clustering.
	    It also employs a novel cluster updating strategy which is inspired
	    by competitive learning techniques developed for Self Organizing
	    Maps (SOMs). In addition, SOStream has built-in online
	    functionality to support advanced stream clustering operations
	    including merging and fading. This makes SOStream completely online
	    with no separate offline components. Experiments performed on KDD
	    Cup'99 and artificial datasets indicate that SOStream is an
	    effective and superior algorithm in creating clusters of higher
	    purity while having lower space and time requirements compared to
	    previous stream clustering algorithms.  },
  pdf = {http://michael.hahsler.net/research/misc/SOStream_2012.pdf},
  url = {http://www.springer.com/computer/ai/book/978-3-642-31536-7},
  category = {stream mining}
}
@inproceedings{hahsler:ElDayeh2012,
  author = {Maya El Dayeh and Michael Hahsler},
  title = {Biological Pathway Completion Using Network Motifs and Random Walks on Graphs},
  booktitle = {IEEE Symposium on Computational
    Intelligence in Bioinformatics and Computational Biology (CIBCB 2012)},
  year = {2012},
  month = {May},
  pages = {229--236},
  location = {San Diego, CA},
  date = {May 9--12, 2012},
  publisher = {IEEE},
  editor = {},
  abstract = {
	Enhancing our understanding of cellular regulatory processes will
	ultimately lead to the development of better therapeutic
	strategies. Completing incomplete biological pathways through
	utilizing probabilistic protein-protein interaction (PPI) networks
	is one approach towards establishing knowledge of these regulatory
	processes.  Previous complex/pathway membership methods focused on
	uncovering candidate protein members from a probabilistic
	protein-protein interaction (PPI) networks. In our previous work,
	we defined the pathway completion problem and developed a method that
	uses network motifs to complete incomplete biological pathways.
	Network motifs allow us to take into consideration the intrinsic
	local structures of the pathways to identify the possible points of
	insertion of candidate proteins. However, our previous approach
	requires a complete and correct PPI network. In this paper, we
	extend our previous work and use random walks on a graph to address
	the pathway completion problem with incomplete PPI networks. We
	evaluate our proposed method using three probabilistic PPI networks
	and two KEGG (Kyoto Encyclopedia of Genes and Genomes) pathways.
	Moreover, we compare the accuracy of our network motif approach for
	pathway completion to the existing approach for pathway membership.
	Our experiments show that our new approach achieves similar or
	better accuracy. In addition, our method identifies the possible
	locations and connections of the candidate proteins in the
	incomplete pathway, thus, allowing for targeted experimental
	verification.
    },
  pdf = {http://michael.hahsler.net/research/BiologicalPathway/CIBCB/Pathway_Random_Walk.pdf},
  category = {bioinformatics}
}
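
To make the random-walk component concrete, here is a small random-walk-with-restart iteration on an invented interaction graph; it only shows how nodes can be ranked by proximity to a seed protein and is not the pathway-completion method itself.

import numpy as np

# Adjacency matrix of a tiny, made-up protein interaction graph.
A = np.array([[0, 1, 1, 0, 0],
              [1, 0, 1, 1, 0],
              [1, 1, 0, 0, 1],
              [0, 1, 0, 0, 1],
              [0, 0, 1, 1, 0]], dtype=float)
P = A / A.sum(axis=0)                            # column-stochastic transitions
seed = np.array([1, 0, 0, 0, 0], dtype=float)    # restart at a known pathway member

alpha, p = 0.15, seed.copy()
for _ in range(100):
    p = alpha * seed + (1 - alpha) * P.dot(p)    # random walk with restart
print("visit probabilities:", np.round(p, 3))
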
@inproceedings{hahsler:ElDayeh2011,
  author = {Maya El Dayeh and Michael Hahsler},
  title = {Analyzing incomplete biological pathways using network motifs},
  booktitle = {27th Symposium On Applied Computing (SAC 2012)},
  year = {2012},
  month = {},
  pages = {1355--1360},
  location = {Riva del Garda, Italy},
  date = {March 26--30, 2012},
  publisher = {ACM},
  volume = {2},
  number = {2},
  editor = {},
  abstract = {
    It is widely accepted that existing knowledge about the structure
	of many biological pathways is incomplete and uncovering
	missing proteins in a biological pathway can help guide targeted
	therapy and drug design and discovery. Current approaches
	address the complex/pathway membership problem by identifying
	potentially missing proteins using probabilistic protein-protein
	interaction (PPI) networks. In this paper we extend the idea of the
	pathway membership problem and define the pathway completion
	problem. In addition to finding possible protein candidates, this
	problem requires predicting the locations and connections of these
	proteins within a given incomplete pathway. We propose the use
	of network motifs to tackle the pathway completion problem. We
	present an algorithm which breaks down an incomplete pathway
	into a set of constituent motifs and then uses the proteins retrieved
	from a probabilistic PPI network to improve the motifs. This new
	approach also has the potential to improve solutions to the
	membership problem by better exploiting the local structures
	represented by network motifs. These new ideas are illustrated
	with a set of preliminary experiments.
    },
  pdf = {http://michael.hahsler.net/research/BiologicalPathway/Pathway_Motifs_SAC2012.pdf},
  category = {bioinformatics}
}
@inproceedings{hahsler:Jovanovic2011,
  author = {Vladimir Jovanovic and Margaret H. Dunham and Michael Hahsler and Yu Su},
  title = {Evaluating Hurricane Intensity Prediction Techniques in Real Time},
  booktitle = {Third IEEE ICDM Workshop on Knowledge Discovery from Climate Data, Proceedings of the of the 2011 IEEE International Conference on Data Mining Workshops (ICDMW 2011)},
  year = {2011},
  month = {December},
  pages = {23--29},
  location = {Vancouver, Canada},
  date = {December 10, 2011},
  publisher = {IEEE},
  editor = {},
  abstract = {
    While the accuracy of hurricane track prediction
	has been improving, predicting intensity, the maximum sustained wind
	speed, is still a very difficult challenge. This is problematic because
	the destructive power of a hurricane is directly related to its
	intensity. In this paper, we present Prediction Intensity Interval
	model for Hurricanes (PIIH) which combines sophisticated data mining
	techniques to create an online real time model for accurate intensity
	predictions and we present a web-based framework to dynamically compare
	PIIH to operational models used by the National Hurricane Center (NHC).
	The created dynamic website tracks, compares, and provides
	visualization to facilitate immediate comparisons of prediction
	techniques. This is a work-in-progress paper reporting on both new
    features of the PIIH model and the online visualization of the accuracy of
	that model compared to other techniques.
    },
  pdf = {http://michael.hahsler.net/research/Hurricane/ICDMW_11/PIIH_Evaluation.pdf},
  doi = {10.1109/ICDMW.2011.78},
  category = {stream mining, earth}
}
@article{hahsler:Hahsler2011d,
  author = {Michael Hahsler and Sudheer Chelluboina and 
	Kurt Hornik and Christian Buchta},
  title = {The arules {R}-Package Ecosystem: Analyzing Interesting Patterns from Large Transaction Datasets},
  journal = {Journal of Machine Learning Research},
  year = {2011},
  volume = {12},
  number = {},
  pages = {1977--1981},
  url = {http://jmlr.csail.mit.edu/papers/v12/hahsler11a.html},
  abstract = {
    This paper describes the ecosystem of R add-on packages developed around
	the infrastructure provided by the package arules. The packages
	provide comprehensive functionality for analyzing interesting patterns
	including frequent itemsets, association rules, frequent sequences and
	for building applications like associative classification. After
        discussing the ecosystem's design we illustrate the ease of mining
	and visualizing rules with a short example.
    },
  category = {association rules, visualization}
}
@misc{hahsler:Hahsler2011c,
  author = {Michael Hahsler and Sudheer Chelluboina},
  title = {Visualizing Association Rules in Hierarchical Groups},
  howpublished = {Unpublished. Presented at the 42nd Symposium on the Interface:
    Statistical, Machine Learning, and Visualization Algorithms
    (Interface 2011)},
  year = {2011},
  month = {June},
  pages = {},
  location = {Cary, North Carolina},
  date = {June 1--3, 2011},
  publisher = {The Interface Foundation of North America},
  editor = {},
  abstract = {
    Association rule mining is one of the most popular data mining methods.
	However, mining association rules often results in a very large number
	of found rules, leaving the analyst with the task to go through all the
	rules and discover interesting ones. Sifting manually through large
	sets of rules is time consuming and strenuous. Visualization has a long
	history of making large amounts of data better accessible using
	techniques like selecting and zooming. However, most association rule
	visualization techniques are still falling short when it comes to a
	large number of rules. In this paper we present a new interactive
	visualization technique which lets the user navigate through a
	hierarchy of groups of association rules. We demonstrate how this new
	visualization technique can be used to analyze large sets of
	association rules with examples from our implementation in the
	R-package arulesViz.
    },
  pdf = {http://michael.hahsler.net/research/Interface2011/arulesViz/arulesViz.pdf},
  category = {association rules, visualization}
}
@inproceedings{hahsler:Hahsler2011,
  author = {Michael Hahsler and Margaret H. Dunham},
  title = {Temporal Structure Learning for Clustering Massive Data Streams
    in Real-Time},
  booktitle = {{SIAM} Conference on Data Mining ({SDM11})},
  year = {2011},
  month = {April},
  pages = {664--675},
  location = {Mesa, Arizona},
  date = {April 28--30, 2011},
  publisher = {SIAM},
  editor = {},
  abstract = {
	This paper describes one of the first attempts to model the temporal
	structure of massive data streams in real-time using data stream
	clustering.  Recently, many data stream clustering algorithms have
	been developed which efficiently find a partition of the data
	points in a data stream. However, these algorithms disregard the
	information represented by the temporal order of the data points in
	the stream which for many applications is an important part of the
	data stream.  In this paper we propose a new framework called
	Temporal Relationships Among Clusters for Data Streams (TRACDS)
	which makes it possible to learn the temporal structure while clustering a
	data stream.  We identify, organize and describe the clustering
	operations which are used by state-of-the-art data stream
	clustering algorithms. Then we show that by defining a set of new
	operations to transform Markov Chains with states representing
	clusters dynamically, we can efficiently capture temporal ordering
	information. This framework allows us to preserve temporal
	relationships among clusters for any state-of-the-art data stream
	clustering algorithm with only minimal overhead.

	To investigate the usefulness of TRACDS, we evaluate the improvement of
	TRACDS over pure data stream clustering for anomaly detection using
	several synthetic and real-world data sets.  The experiments show that
	TRACDS is able to considerably improve the results even if we introduce
	a high rate of incorrect time stamps which is typical for real-world
	data streams.
    },
  url = {http://siam.omnibooksonline.com/2011datamining/data/papers/023.pdf#page=1},
  pdf = {http://michael.hahsler.net/research/TRACDS_SDM11/TRACDS_SDM11.pdf},
  category = {stream mining}
}
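
The core bookkeeping behind modeling temporal order, counting transitions between cluster assignments as a stream is processed, can be sketched in a few lines. The "clustering" below is a trivial 1-D threshold rule used only as a stand-in; this is not the TRACDS framework.

import numpy as np
from collections import defaultdict

def assign(x):
    return 0 if x < 0.5 else 1       # stand-in for an online clustering step

transitions = defaultdict(int)       # (from_state, to_state) -> count
rng = np.random.default_rng(3)
stream = rng.random(500)

prev = None
for x in stream:
    state = assign(x)
    if prev is not None:
        transitions[(prev, state)] += 1
    prev = state

from_counts = defaultdict(int)
for (i, j), c in transitions.items():
    from_counts[i] += c
for (i, j), c in sorted(transitions.items()):
    print(f"P({j} | {i}) ~ {c / from_counts[i]:.2f}")
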
@article{hahsler:Dunham2010b,
  author = {Margaret H. Dunham and Michael Hahsler and Myra Spiliopoulou},
  title = {Novel Data Stream Pattern Mining, {Report on the StreamKDD'10 Workshop}},
  journal = {SIGKDD Explorations},
  year = {2010},
  volume = {12},
  number = {2},
  pages = {54--55},
  url = {http://www.sigkdd.org/explorations/issue.php?volume=12&issue=2&year=2010&month=12},
  abstract = {
    This report summarizes the First International Workshop on
	Novel Data Stream Pattern Mining held at the 16th ACM SIGKDD
	International Conference on Knowledge Discovery and Data
	Mining, on July 25 2010 in Washington, DC.
    },
  category = {stream mining}
}
@article{hahsler:Hahsler2011b,
  author = {Michael Hahsler and Kurt Hornik},
  title = {Dissimilarity Plots: {A} Visual Exploration Tool for Partitional Clustering},
  journal = {Journal of Computational and Graphical Statistics},
  year = {2011},
  month = {June},
  volume = {20},
  number = {2},
  pages = {335--354},
  doi = {10.1198/jcgs.2010.09139},
  pdf = {http://michael.hahsler.net/research/dissplot_JCGS2011/dissplot_preprint.pdf},
  abstract = {
    For hierarchical clustering, dendrograms are a convenient
    and powerful visualization technique. Although many visualization methods
    have been suggested for partitional clustering, their usefulness
    deteriorates quickly with increasing dimensionality of the data and/or they
    fail to represent structure between and within clusters simultaneously.  In
    this paper we extend (dissimilarity) matrix shading with several reordering
    steps based on seriation techniques.  Both ideas, matrix shading and
    reordering, have been well-known for a long time.  However, only recent
    algorithmic improvements allow us to solve or approximately solve the
    seriation problem efficiently for larger problems.  Furthermore, seriation
    techniques are used in a novel stepwise process (within each cluster and
    between clusters) which leads to a visualization technique that is
    able to present the structure between clusters and the micro-structure
    within clusters in one concise plot. This not only allows us to judge
    cluster quality but also makes mis-specification of the number of clusters
    apparent.  We give a detailed discussion of the construction of
    dissimilarity plots and demonstrate their usefulness with several examples.
    Experiments show that dissimilarity plots scale very well with increasing
    data dimensionality.
    },
  category = {seriation, visualization, optimization}
}
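
A bare-bones illustration of matrix shading with a seriation-style reordering (here simply the leaf order of a hierarchical clustering): the data are synthetic, matplotlib does the shading, and none of the stepwise within/between-cluster reordering of the paper is reproduced.

import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, leaves_list
from scipy.spatial.distance import pdist, squareform

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 1, (20, 5)), rng.normal(4, 1, (20, 5))])
d = pdist(X)
D = squareform(d)

order = leaves_list(linkage(d, method="average"))   # seriation via leaf order
plt.imshow(D[np.ix_(order, order)], cmap="gray_r")
plt.title("reordered dissimilarity matrix")
plt.savefig("dissplot_sketch.png")
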
@inproceedings{hahsler:Yu2010,
  author = {Yu Su and Sudheer Chelluboina and Michael Hahsler and Margaret H. Dunham},
  title = {A New Data Mining Model for Hurricane Intensity Prediction},
  booktitle = {Second IEEE ICDM Workshop on Knowledge Discovery from Climate Data: Prediction, Extremes and Impacts, Proceedings of the of the 2010 IEEE International Conference on Data Mining Workshops (ICDMW 2010)},
  year = {2010},
  month = {December},
  pages = {98--105},
  location = {Sydney, Australia},
  date = {December 14, 2010},
  publisher = {IEEE},
  editor = {},
  doi = {10.1109/ICDMW.2010.158},
  abstract = {
    This paper proposes a new hurricane intensity prediction model, WFL-EMM,
    which is based on the data mining techniques of feature weight learning
    (WFL) and Extensible Markov Model (EMM). The data features used are
    those employed by one of the most popular intensity prediction models,
    SHIPS.  In our algorithm, the weights of the features are learned by a
    genetic algorithm (GA) using historical hurricane data. As the GA's
    fitness function we use the error of the intensity prediction by an EMM
    learned using given feature weights.  For fitness calculation we use a
    technique similar to $k$-fold cross validation on the training data.
    The best weights obtained by the genetic algorithm are used to build an
    EMM with all training data. This EMM is then applied to predict the
    hurricane intensities and compute prediction errors for the test data.
    Using historical data for the named Atlantic tropical cyclones from
    1982 to 2003, experiments demonstrate that WFL-EMM provides
    significantly more accurate intensity predictions than SHIPS within 72
    hours. Since we report here first results, we indicate how to improve
    WFL-EMM in the future.
    },
  pdf = {http://michael.hahsler.net/research/Hurricane/ICDMW_10/05693288.pdf},
  category = {stream mining, earth}
}
@book{hahsler:Dunham2010,
  editor = {Margaret H. Dunham and Michael Hahsler and Myra Spiliopoulou},
  title = {Proceedings of the First International Workshop on Novel Data Stream Pattern Mining Techniques (StreamKDD'10)},
  year = 2010,
  isbn = {978-1-4503-0226-5},
  location = {Washington, D.C.},
  publisher = {ACM},
  address = {New York, NY, USA},
  url = {http://portal.acm.org/citation.cfm?id=1833280},
  abstract = {Data stream mining gained in importance over the last years
    because it is indispensable for many real applications such as
    prediction and evolution of weather phenomena; security and anomaly
    detection in networks; evaluating satellite data; and mining health
    monitoring streams. Stream mining algorithms must take account of
    the unique properties of stream data: infinite data, temporal
    ordering, concept drifts and shifts, demand for scalability etc.
    
    This workshop brings together scholars working in different areas of
    learning on streams, including sensor data and other forms of accumulating
    data. Most of the papers in the next pages are on unsupervised learning
    with clustering methods. Issues addressed include the detection of outliers
    and anomalies, evolutionary clustering and incremental clustering, learning
    in subspaces of the complete feature space and learning with exploitation
    of context, deriving models from text streams and visualizing them. },
  category = {stream mining}
}
@article{hahsler:Kotamarti2010b,
  author = {Kotamarti, Rao M. and Hahsler, Michael and Raiford, Douglas and McGee, Monnie and Dunham, Margaret H.},
  title = {Analyzing Taxonomic Classification Using Extensible {M}arkov Models},
  journal = {Bioinformatics},
  volume = {26},
  number = {18},
  doi = {10.1093/bioinformatics/btq349},
  year = {2010},
  pages = {2235--2241},
  abstract = {
Motivation: As next generation sequencing is rapidly adding new genomes, their
correct placement in the taxonomy needs verification. However,
the current methods for confirming classification of a taxon or
suggesting revision for a potential misplacement rely on
computationally intense multi-sequence alignment followed by an
iterative adjustment of the distance matrix. Due to
intra-heterogeneity issues with the 16S rRNA marker, no
classifier is available for sub-genus level that could readily
suggest a classification for a novel 16S rRNA sequence.
Metagenomics further complicates the issue by generating
fragmented 16S rRNA sequences. This paper proposes a novel
alignment-free method for representing the microbial profiles
using Extensible Markov Models (EMM) with an extended
Karlin-Altschul statistical framework similar to the classic
alignment paradigm. We propose a Log Odds (LOD) score
classifier based on Gumbel difference distribution that
confirms correct classifications with statistical significance
qualifications and suggests revisions where necessary.
Results: We tested our method by generating a sub-genus level
classifier with which we re-evaluated classifications of 676
microbial organisms using the NCBI FTP database for the 16S
rRNA. The results confirm current classification for all genera
while ascertaining significance at 95\%. Furthermore, this novel
classifier isolates heterogeneity issues to a mere 12
strains while confirming classifications with
significance qualification for the remaining 98\%. The
models require less memory than that needed by
multi-sequence alignments and have better time
complexity than the current methods. The classifier
operates at sub-genus level and thus outperforms the
naive Bayes classifier of the RNA Database Project
where much of the taxonomic analysis is available
online. Finally, using information redundancy in model
building, we show that the method applies to
metagenomic fragment classification of 19 E.coli
strains.  
},
  url = {http://bioinformatics.oxfordjournals.org/content/26/18/2235},
  category = {bioinformatics}
}
@article{hahsler:Hahsler2010,
  author = {Michael Hahsler and Margaret H. Dunham},
  title = {{rEMM}: Extensible {M}arkov Model for Data Stream
	Clustering in {R}},
  journal = {Journal of Statistical Software},
  year = {2010},
  volume = {35},
  number = {5},
  pages = {1--31},
  doi = {10.18637/jss.v035.i05},
  pdf = {https://www.jstatsoft.org/index.php/jss/article/view/v035i05/v35i05.pdf},
  abstract = {
    Clustering streams
	of continuously arriving data has become an important application of
	data mining in recent years and efficient algorithms have been proposed
	by several researchers. However, clustering alone neglects the fact
	that data in a data stream is not only characterized by the proximity
	of data points which is used by clustering, but also by a temporal
	component. The Extensible Markov Model (EMM) adds the temporal
	component to data stream clustering by superimposing a dynamically
	adapting Markov Chain. In this paper we introduce the implementation of
	the R extension package rEMM which implements EMM and we discuss some
	examples and applications.
    },
  category = {stream mining}
}
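A minimal usage sketch for the rEMM package described above, assuming the EMM() constructor, the build() function, and the EMMTraffic example data shipped with the package; the threshold value is an illustrative placeholder:

library(rEMM)                      # Extensible Markov Model for data stream clustering
data("EMMTraffic")                 # small example data stream from the package
emm <- EMM(threshold = 0.2, measure = "euclidean")  # new model; threshold chosen for illustration
emm <- build(emm, EMMTraffic)      # cluster the stream and superimpose the Markov chain
plot(emm)                          # states (clusters) with transitions as directed edges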
@inproceedings{hahsler:Kotamarti2010,
  author = {Rao M. Kotamarti and Michael Hahsler and Douglas W. Raiford and Margaret H. Dunham},
  title = {Sequence transformation to a complex signature form for consistent Phylogenetic tree using Extensible {M}arkov Model},
  booktitle = {Proceedings of the 2010 IEEE Symposium on Computational Intelligence in Bioinformatics and Computational Biology (IEEE CIBCB 2010)},
  year = {2010},
  editor = {},
  pages = {},
  publisher = {IEEE},
  abstract = {
      Phylogenetic tree analysis using molecular sequences
	  continues to expand beyond the 16S rRNA marker. By addressing
	  the multi-copy issue known as the intra-heterogeneity,
      this paper restores the focus on using the 16S rRNA marker.
	  Through use of a novel learning and model building algorithm,
      the multiple gene copies are integrated into a compact complex
	  signature using the Extensible Markov Model (EMM). The
	  method clusters related sequence segments while preserving
	  their inherent order to create an EMM signature for a microbial
	  organism. A library of EMM signatures is generated
	  from which samples are drawn for phylogenetic analysis. By
	  matching the components of two signatures, referred to as
	  quasi-alignment, the differences are highlighted and scored.
	  Scoring quasi-alignments is done using adapted Karlin-Altschul
	  statistics to compute a novel distance metric. The metric satisfies
	  conditions of identity, symmetry, triangular inequality and the
	  four point rule required for a valid evolution distance metric.
	  The resulting distance matrix is input to the PHYlogeny Inference
	  Package (PHYLIP) to generate phylogenies using neighbor
	  joining algorithms. Through control of clustering in signature
	  creation, the diversity of similar organisms and their placement
	  in the phylogeny is explained. The experiments include analysis
	  of genus Burkholderia, a random microbial sample spanning
	  several phyla and a diverse sample that includes RNA of
	  Eukaryotic origin. The NCBI sequence data for 16S rRNA is
	  used for validation.
  },
  doi = {10.1109/CIBCB.2010.5510472},
  pdf = {http://michael.hahsler.net/research/EMMSA/EMMSA_CIBCB2010.pdf},
  category = {bioinformatics}
}
@techreport{hahsler:Kotamarti2009,
  author = {Rao M. Kotamarti and Douglas W. Raiford and Michael Hahsler and Yuhang Wang and Monnie McGee and Margaret H. Dunham},
  title = {Targeted Genomic signature profiling with Quasi-alignment statistics},
  institution = {COBRA Preprint Series},
  year = {2009},
  type = {Article},
  number = {63},
  month = {November},
  abstract = {
  Genome databases continue to expand with no change in the basic format of 
      sequence data. The prevalent use of the classic alignment based search
      tools like BLAST has significantly pushed the limits of genome isolate
      research. The relatively new frontier of Metagenomic research deals with
      thousands of diverse genomes with newer demands beyond the current
      homologue search and analysis. Compressing sequence data into a complex
      form could facilitate a broader range of sequence analyses. To this end,
  this research explores reorganizing sequence data as complex Markov
      signatures also known as Extensible Markov Models. Markov models have
      found successful application in biological sequence analysis
      applications through small, but important extensions to the original
      theory of Markov Chains. Extensible Markov Model (EMM) offers a novel
      Quasi-alignment complement to the classic alignment based homologous
      sequence search methods like BLAST. EMM based bioinformatic analysis
      (EMMBA) incorporates automatic learning which allows the Markov chain
      creation dynamically. Oligonucleotide or genomic word frequencies form
      the core sequence data in alignment free methods. EMMBA extends the
      Karlin-Altschul statistics to bring forth an analogous E-Score
      statistical significance to the quasi-alignment domain. By consolidating
      a community of sequences into a single searchable profile, EMM
      methodology further reduces the search space for classification.  Through
      dynamic generation of the score matrix for each community profile, EMMBA
      fine tunes the score assignments. Each evaluation iteratively adjusts the
      profile score matrix to account for point probabilities of the query to
      ensure Karlin-Altschul assumptions are satisfied to derive meaningful
      statistical significance. The presence of multiple quasi-alignments
      resembles multiple local alignments of BLAST. Quasi-alignments are
      scored based on a difference distribution of Gumbel scores. Species
      signature profiles allow for statistical validation of novel species
      identification. Working in EMM transformation space speeds up
classification and generates a distance matrix for differentiation. The techniques
and metrics presented are validated using the microbial 16S rRNA sequence data
from NCBI.

  },
  url = {http://biostats.bepress.com/cobra/ps/art63/},
  category = {bioinformatics}
}
@techreport{hahsler:Hahsler2009,
  author = {Michael Hahsler and Kurt Hornik},
  title = {Dissimilarity Plots: A Visual Exploration Tool for Partitional Clustering},
  institution = {Research Report Series, Department of Statistics and Mathematics, Wirtschaftsuniversit{\"a}t Wien},
  year = {2009},
  type = {Report},
  number = {89},
  address = {Augasse 2--6, 1090 Wien, Austria},
  month = {September},
  abstract = {For hierarchical clustering, dendrograms provide convenient
    and powerful visualization. Although many visualization methods have
    been suggested for partitional clustering, their usefulness
    deteriorates quickly with increasing dimensionality of the data and/or
    they fail to represent structure between and within clusters
    simultaneously. In this paper we extend (dissimilarity) matrix shading
    with several reordering steps based on seriation.  Both methods,
    matrix shading and seriation, have been well-known for a long time.
    However, only recent algorithmic improvements allow us to use seriation
    for larger problems.  Furthermore, seriation is used in a novel
    stepwise process (within each cluster and between clusters) which
    leads to a visualization technique that is independent of the
    dimensionality of the data.  A big advantage is that it presents the
    structure between clusters and the micro-structure within clusters
    in one concise plot. This not only allows for judging
    cluster quality but also makes mis-specification of the number of clusters
    apparent.  We give a detailed discussion of the construction of
    dissimilarity plots and demonstrate their usefulness with several
    examples.},
  nopdf = {http://michael.hahsler.net/research/dissplot_workingpaper2009/dissplot.pdf},
  url = {http://epub.wu.ac.at/1244},
  category = {seriation, visualization, optimization}
}
@article{hahsler:Hahsler2007g,
  author = {Michael Hahsler and Kurt Hornik},
  title = {{TSP} -- {I}nfrastructure for the Traveling Salesperson
                           Problem},
  journal = {Journal of Statistical Software},
  year = {2007},
  volume = {23},
  pages = {1-21},
  number = {2},
  month = {December},
  abstract = {
The traveling salesperson (or salesman) problem (TSP) is a well-known and
    important combinatorial optimization problem.  The goal is to find the
    shortest tour that visits each city in a given list exactly once and then
    returns to the starting city.  Despite this simple problem statement,
solving the TSP is difficult since it belongs to the class of NP-complete
    problems.  The importance of the TSP arises, besides its theoretical
    appeal, from the variety of its applications.  Typical applications in
    operations research include vehicle routing, computer wiring, cutting
    wallpaper and job sequencing.  The main application in statistics is
    combinatorial data analysis, e.g., reordering rows and columns of data
    matrices or identifying clusters.  In this paper we introduce the
    R~package TSP which provides a basic infrastructure for
    handling and solving the traveling salesperson problem.  The package
    features S3 classes for specifying a TSP and its (possibly optimal)
    solution as well as several heuristics to find good solutions. In addition,
it provides an interface to Concorde, one of the best exact TSP solvers
    currently available.},
  issn = {1548-7660},
  pdf = {https://www.jstatsoft.org/index.php/jss/article/view/v023i02/v23i02.pdf},
  doi = {10.18637/jss.v023.i02},
  nopdf = {http://michael.hahsler.net/research/TSP_jss2007/v23i02/v23i02.pdf},
  category = {seriation, visualization, optimization}
}
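A short sketch of the package interface described above, assuming the USCA50 example data set and the heuristics offered by solve_TSP(); data set and method are illustrative choices:

library(TSP)                               # infrastructure for the traveling salesperson problem
data("USCA50")                             # distances between 50 North American cities (ships with TSP)
tour <- solve_TSP(USCA50, method = "nearest_insertion")  # one of the included construction heuristics
tour_length(tour)                          # length of the computed tour
head(labels(tour))                         # first few cities in tour order

The interface to the exact solver Concorde mentioned in the abstract additionally requires the external Concorde executable to be installed.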
@article{hahsler:Hahsler2008,
  author = {Michael Hahsler and Kurt Hornik and Christian Buchta},
  title = {Getting Things in Order: An Introduction to the {R} 
      Package seriation},
  journal = {Journal of Statistical Software},
  year = {2008},
  volume = {25},
  pages = {1--34},
  number = {3},
  month = {March},
  abstract = {Seriation, i.e., finding a linear order for a set of objects
      given data and a loss or merit function, is a basic problem in data
          analysis.  Caused by the problem's combinatorial nature, it is hard
          to solve for all but very small sets.  Nevertheless, both exact
          solution methods and heuristics are available.  In this paper we
          present the package~seriation which provides the infrastructure for
          seriation with R.  The infrastructure comprises data structures to
          represent linear orders as permutation vectors, a wide array of
          seriation methods using a consistent interface, a method to calculate
          the value of various loss and merit functions, and several
          visualization techniques which build on seriation. To illustrate how
          easily the package can be applied for a variety of applications, a
          comprehensive collection of examples is presented.},
  issn = {1548-7660},
  pdf = {https://www.jstatsoft.org/index.php/jss/article/view/v025i03/v25i03.pdf},
  doi = {10.18637/jss.v025.i03},
  nopdf = {http://michael.hahsler.net/research/seriation_JSS2008/seriation.pdf},
  category = {seriation, visualization, optimization}
}
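A minimal sketch of the workflow described above, assuming seriate(), criterion(), and pimage() from the seriation package; data set and method are illustrative:

library(seriation)                 # seriation methods, criteria, and matrix shading
d <- dist(scale(iris[, 1:4]))      # dissimilarities between observations
o <- seriate(d, method = "TSP")    # find a linear order, here via a TSP-based heuristic
pimage(d, o)                       # shaded dissimilarity matrix reordered by the permutation o
criterion(d, o)                    # values of the available loss/merit functions for this order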
@techreport{hahsler:Hahsler2007e,
  author = {Michael Hahsler and Kurt Hornik and Christian Buchta},
  title = {Getting Things in Order: An Introduction to the {R} package seriation},
  institution = {Research Report Series, Department of Statistics and Mathematics,
	Wirtschaftsuniversit{\"a}t Wien},
  year = {2007},
  type = {Report},
  number = {58},
  address = {Augasse 2--6, 1090 Wien, Austria},
  month = {August},
  abstract = {     Seriation, i.e., finding a linear order for a set of objects
      given data and a loss or merit function, is a basic problem in data
          analysis. Caused by the problem's combinatorial nature, it is
          hard to solve for all but very small sets. Nevertheless, both exact
          solution methods and heuristics are available. In this paper we
          present the package seriation which provides the infrastructure for
          seriation with R. The infrastructure comprises data structures to
          represent linear orders as permutation vectors, a wide array of
          seriation methods using a consistent interface, a method to calculate
          the value of various loss and merit functions, and several
          visualization techniques which build on seriation. To illustrate how
          easily the package can be applied for a variety of applications, a
          comprehensive collection of examples is presented.  },
  nopdf = {http://michael.hahsler.net/research/seriation_working2007/seriation.pdf},
  url = {http://epub.wu.ac.at/852},
  category = {seriation, visualization, optimization}
}
@incollection{hahsler:Hahsler2007b,
  author = {Christoph Breidert and Michael Hahsler},
  title = {Adaptive Conjoint Analysis for Pricing Music Downloads},
  booktitle = {Advances in Data Analysis},
  year = {2007},
  pages = {409--416},
  editor = {R. Decker and H.-J. Lenz},
  series = {Studies in Classification, Data Analysis, and Knowledge Organization},
  publisher = {Springer-Verlag},
  abstract = {Finding the right pricing for music downloads is of ample importance
	to the recording industry and music download service providers. For
	the recently introduced music downloads, reference prices are still
	developing and to find a revenue maximizing pricing scheme is a challenging
	task. The most commonly used approach is to employ linear pricing
	(e.g., iTunes, musicload). Lately, subscription models have emerged,
	offering their customers unlimited access to streaming music for
	a monthly fee (e.g., Napster, RealNetworks). However, other pricing
	strategies could also be used, such as quantity rebates starting
	at certain download volumes. Research has been done in this field
	and Buxmann et al. (2005) have shown that price cuts can improve
	revenue. In this paper we apply different approaches to estimate
	consumer's willingness to pay (WTP) for music downloads and compare
	our findings with the pricing strategies currently used in the market.
	To make informed decisions about pricing, knowledge about the consumer's
	WTP is essential. Three approaches based on adaptive conjoint analysis
	to estimate the WTP for bundles of music downloads are compared.
	Two of the approaches are based on a status-quo product (at market
	price and alternatively at an individually self-stated price), the
	third approach uses a linear model assuming a fixed utility per title.
	All three methods seem to be robust and deliver reasonable estimations
	of the respondent's WTPs. However, all but the linear model need
	an externally set price for the status-quo product which can introduce
	a bias.},
  pdf = {http://michael.hahsler.net/research/conjoint_gfkl2006/conjoint_music.pdf},
  url = {http://dx.doi.org/10.1007/978-3-540-70981-7},
  category = {marketing, optimization}
}
@incollection{hahsler:Hahsler2007,
  author = {Michael Hahsler and Kurt Hornik},
  title = {Building on the arules Infrastructure for Analyzing Transaction Data
	with {R}},
  booktitle = {Advances in Data Analysis},
  pages = {449--456},
  year = {2007},
  editor = {R. Decker and H.-J. Lenz},
  series = {Studies in Classification, Data Analysis, and Knowledge Organization},
  publisher = {Springer-Verlag},
  abstract = {The free and extensible statistical computing environment R with its
	enormous number of extension packages already provides many state-of-the-art
	techniques for data analysis. Support for association rule mining,
	a popular exploratory method which can be used, among other purposes,
	for uncovering cross-selling opportunities in \emph{market baskets,}
	has become available recently with the R extension package~arules.
	After a brief introduction to transaction data and association rules,
	we present the formal framework implemented in arules and demonstrate
	how clustering and association rule mining can be applied together
	using a market basket data set from a typical retailer. This paper
	shows that implementing a basic infrastructure with formal classes
	in R provides an extensible basis which can very efficiently be employed
	for developing new applications (such as clustering transactions)
	in addition to association rule mining.},
  pdf = {http://michael.hahsler.net/research/arules_gfkl2006/arules_gfkl2006.pdf},
  doi = {10.1007/978-3-540-70981-7_51},
  category = {association rules}
}
@article{hahsler:Hahsler2007c,
  author = {Michael Hahsler and Kurt Hornik},
  title = {New Probabilistic Interest Measures for Association Rules},
  journal = {Intelligent Data Analysis},
  year = {2007},
  volume = {11},
  number = {5},
  pages = {437--455},
  abstract = {Mining association rules is an important technique for discovering
	meaningful patterns in transaction databases. Many different measures
	of interestingness have been proposed for association rules. However,
	these measures fail to take the probabilistic properties of the mined
	data into account. In this paper, we start with presenting a simple
	probabilistic framework for transaction data which can be used to
	simulate transaction data when no associations are present. We use
	such data and a real-world database from a grocery outlet to explore
	the behavior of confidence and lift, two popular interest measures
	used for rule mining. The results show that confidence is systematically
	influenced by the frequency of the items in the left hand side of
	rules and that lift performs poorly to filter random noise in transaction
	data. Based on the probabilistic framework we develop two new interest
	measures, hyper-lift and hyper-confidence, which can be used to filter
	or order mined association rules. The new measures show significantly
	better performance than lift for applications where spurious rules
	are problematic. },
  issn = {1088-467X},
  url = {http://iospress.metapress.com/openurl.asp?genre=article&issn=1088-467X&volume=11&issue=5&spage=437},
  pdf = {http://michael.hahsler.net/research/hyperConfidence_IDA2007/hyperConfidence.pdf},
  category = {association rules}
}
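A hedged sketch of computing the two proposed measures, assuming they are exposed as the "hyperLift" and "hyperConfidence" options of interestMeasure() in the arules package; the Groceries data and the thresholds are illustrative:

library(arules)
data("Groceries")                                  # real-world grocery transactions shipped with arules
rules <- apriori(Groceries, parameter = list(support = 0.001, confidence = 0.5))
# append hyper-lift and hyper-confidence to the standard rule quality measures
quality(rules) <- cbind(quality(rules),
  interestMeasure(rules, measure = c("hyperLift", "hyperConfidence"),
                  transactions = Groceries))
inspect(head(sort(rules, by = "hyperLift"), 3))    # rules ranked by hyper-lift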
@article{hahsler:Hahsler2007d,
  author = {Michael Hahsler and Christian Buchta and Kurt Hornik},
  title = {Selective Association Rule Generation},
  journal = {Computational Statistics},
  year = {2008},
  volume = {23},
  pages = {303--315},
  number = {2},
  month = {April},
  doi = {10.1007/s00180-007-0062-z},
  url = {http://dx.doi.org/10.1007/s00180-007-0062-z},
  abstract = {Mining association rules is a popular and well researched
    method for discovering interesting relations between variables in
    large databases. A practical problem is that at medium to low support
    values often a large number of frequent itemsets and an even larger
    number of association rules are found in a database.  A widely used
    approach is to gradually increase minimum support and minimum
    confidence or to filter the found rules using increasingly strict
    constraints on additional measures of interestingness until the set of
    rules found is reduced to a manageable size.  In this paper we describe
    a different approach which is based on the idea to first define a set
    of ``interesting'' itemsets (e.g., by a mixture of mining and expert
    knowledge) and then, in a second step to selectively generate rules
    for only these itemsets.  The main advantage of this approach over
    increasing thresholds or filtering rules is that the number of rules
    found is significantly reduced while at the same time it is not
    necessary to increase the support and confidence thresholds which
    might lead to missing important information in the database.
  },
  issn = {0943-4062},
  pdf = {http://michael.hahsler.net/research/ruleGeneration_cost2007/ruleInduction_CompStat.pdf},
  category = {association rules}
}
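A sketch of the two-step idea (first fix a set of interesting itemsets, then generate rules only for them), assuming eclat() and ruleInduction() from the arules package; selecting the most frequent pairs is only a stand-in for the mixture of mining and expert knowledge mentioned in the abstract:

library(arules)
data("Groceries")
# Step 1: obtain a set of "interesting" itemsets (here simply the most frequent pairs)
itemsets <- eclat(Groceries, parameter = list(support = 0.005, minlen = 2))
interesting <- head(sort(itemsets, by = "support"), 10)
# Step 2: selectively generate rules for these itemsets only
rules <- ruleInduction(interesting, Groceries, confidence = 0.4)
inspect(rules)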
@article{hahsler:Reutterer2007,
  author = {Thomas Reutterer and Michael Hahsler and Kurt Hornik},
  title = {{Data Mining und Marketing am Beispiel der explorativen Warenkorbanalyse}},
  journal = {{Marketing ZFP}},
  year = {2007},
  volume = {29},
  number = {3},
  pages = {165--181},
  abstract = {Data mining techniques are an increasingly important addition to the
      traditional methodological toolbox of marketing research and practice. The
      goal of using such primarily data-driven analysis tools is to extract
      marketing-relevant information ``intelligently'' from large databases
      (so-called data warehouses) and to prepare it in a suitable form for
      further decision support. This article discusses the points of contact
      between data mining and marketing and demonstrates the concrete use of
      selected data mining methods with the example of exploratory market basket
      (assortment association) analysis for a transaction data set from grocery
      retailing. The techniques applied include classical affinity analysis, a
      \textit{K}-medoid clustering procedure, and tools for generating and
      subsequently assessing association rules between the product groups
      contained in the assortment. The approach is illustrated using the
      extension package \textbf{arules}, which is freely available for the
      statistical software R.
  },
  url = {http://www.jstor.org/stable/41922084},
  category = {association rules, marketing}
}
@techreport{hahsler:Hahsler2006g,
  author = {Michael Hahsler and Kurt Hornik},
  title = {{TSP} -- {I}nfrastructure for the Traveling
  Salesperson Problem},
  institution = {Research Report Series, Department of Statistics and Mathematics,
	Wirtschaftsuniversit{\"a}t Wien},
  year = {2006},
  type = {Report},
  number = {45},
  address = {Augasse 2--6, 1090 Wien, Austria},
  month = {December},
  abstract = {The traveling salesperson or salesman problem (TSP) is a
      well-known and important combinatorial optimization problem. The goal is to
          find the shortest tour that visits each city in a given list exactly
          once and then returns to the starting city. Despite this simple
          problem statement, solving the TSP is difficult since it belongs to
          the class of NP-complete problems.  The importance of the TSP arises,
          besides its theoretical appeal, from the variety of its
          applications. In addition to vehicle routing, many other
          applications, e.g., computer wiring, cutting wallpaper, job
          sequencing or several data visualization techniques, require the
          solution of a TSP.  In this paper we introduce the R package TSP
          which provides a basic infrastructure for handling and solving the
          traveling salesperson problem. The package features S3 classes for
          specifying a TSP and its (possibly optimal) solution as well as
          several heuristics to find good solutions. In addition, it provides
          an interface to Concorde, one of the best exact TSP solvers currently
          available.},
  nopdf = {http://michael.hahsler.net/research/TSP_working2006/TSP.pdf},
  url = {http://epub.wu.ac.at/1230},
  category = {seriation, visualization, optimization}
}
@article{hahsler:Hahsler2006e,
  author = {Christoph Breidert and Michael Hahsler and Thomas Reutterer},
  title = {A Review of Methods for Measuring Willingness-to-Pay},
  journal = {Innovative Marketing},
  volume = {2},
  number = {4},
  pages = {8--32},
  year = {2006},
  abstract = {Knowledge about a product's willingness-to-pay on behalf of its (potential)
	customers plays a crucial role in many areas of marketing management
	like pricing decisions or new product development. Numerous approaches
	to measure willingness-to-pay with differential conceptual foundations
	and methodological implications have been presented in the relevant
	literature so far. This article provides the reader with a systematic
	overview of the relevant literature on these competing approaches
	and associated schools of thought, recognizes their respective merits
	and discusses obstacles and issues regarding their adoption to measuring
	willingness-to-pay. Because of its practical relevance, special focus
	will be put on indirect surveying techniques and, in particular,
	conjoint-based applications will be discussed in more detail. The
	strengths and limitations of the individual approaches are discussed
	and evaluated from a managerial point of view.},
  issn = {1814-2427},
  pdf = {http://michael.hahsler.net/research/misc/InnovativeMarketing_2006.pdf},
  url = {http://businessperspectives.org/journals/innovative-marketing/issue-111/a-review-of-methods-for-measuring-willingness-to-pay},
  category = {marketing}
}
@article{hahsler:Hahsler2006a,
  author = {Michael Hahsler},
  title = {A Model-Based Frequency Constraint for Mining Associations from Transaction
	Data},
  journal = {Data Mining and Knowledge Discovery},
  year = {2006},
  volume = {13},
  pages = {137--166},
  number = {2},
  month = {September},
  abstract = {Mining frequent itemsets is a popular method for finding associated
	items in databases. For this method, support, the co-occurrence frequency
	of the items which form an association, is used as the primary indicator
	of the association's significance. A single user-specified support
	threshold is used to decide whether associations should be further investigated.
	Support has some known problems with rare items, favors shorter itemsets
	and sometimes produces misleading associations. In this paper we
	develop a novel model-based frequency constraint as an alternative
	to a single, user-specified minimum support. The constraint utilizes
	knowledge of the process generating transaction data by applying
	a simple stochastic mixture model (the NB model) which allows for
	transaction data's typically highly skewed item frequency distribution.
	A user-specified precision threshold is used together with the model
	to find local frequency thresholds for groups of itemsets. Based
	on the constraint we develop the notion of NB-frequent itemsets and
	adapt a mining algorithm to find all NB-frequent itemsets in a database.
	In experiments with publicly available transaction databases we show
	that the new constraint provides improvements over a single minimum
	support threshold and that the precision threshold is more robust
	and easier to set and interpret by the user. },
  doi = {10.1007/s10618-005-0026-2},
  issn = {1384-5810},
  pdf = {http://michael.hahsler.net/research/nbd_dami2005/nbd_associationrules_dami2005.pdf},
  url = {http://dx.doi.org/10.1007/s10618-005-0026-2},
  category = {association rules}
}
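The NB-frequent itemset mining algorithm itself is, as far as I know, not part of a standard package; the sketch below only illustrates the underlying NB model by fitting a negative binomial distribution to item occurrence counts with MASS::fitdistr(), using the Groceries data as a stand-in:

library(arules)
library(MASS)                                            # fitdistr() for maximum-likelihood fitting
data("Groceries")
counts <- itemFrequency(Groceries, type = "absolute")    # how often each item occurs
fit <- fitdistr(counts, densfun = "negative binomial")   # fit the NB model of item frequencies
fit$estimate                                             # estimated size and mu parameters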
@techreport{hahsler:Hahsler2006c,
  author = {Michael Hahsler and Kurt Hornik},
  title = {New Probabilistic Interest Measures for Association Rules},
  institution = {Research Report Series, Department of Statistics and Mathematics,
	Wirtschaftsuniversit{\"a}t Wien},
  year = {2006},
  type = {Report},
  number = {38},
  address = {Augasse 2--6, 1090 Wien, Austria},
  month = {August},
  abstract = { Mining association rules is an important technique for discovering
	meaningful patterns in transaction databases. Many different measures
	of interestingness have been proposed for association rules. However,
	these measures fail to take the probabilistic properties of the mined
	data into account. In this paper, we start with presenting a simple
	probabilistic framework for transaction data which can be used to
	simulate transaction data when no associations are present. We use
	such data and a real-world database from a grocery outlet to explore
	the behavior of confidence and lift, two popular interest measures
	used for rule mining. The results show that confidence is systematically
	influenced by the frequency of the items in the left hand side of
	rules and that lift performs poorly to filter random noise in transaction
	data. Based on the probabilistic framework we develop two new interest
	measures, hyper-lift and hyper-confidence, which can be used to filter
	or order mined association rules. The new measures show significantly
	better performance than lift for applications where spurious rules
	are problematic. },
  nopdf = {http://michael.hahsler.net/research/arules_working2006/hyperConfidence.pdf},
  url = {http://epub.wu.ac.at/1286},
  category = {association rules}
}
@incollection{hahsler:Hahsler2006f,
  author = {Michael Hahsler and Kurt Hornik and Thomas Reutterer},
  title = {{Warenkorbanalyse mit Hilfe der Statistik-Software R}},
  booktitle = {Innovationen in Marketing},
  year = {2006},
  editor = {Peter Schnedlitz and Renate Buber and Thomas Reutterer and Arnold
	Schuh and Christoph Teller},
  pages = {144--163},
  publisher = {Linde-Verlag},
  abstract = {Market basket (or assortment association) analysis refers to a family
	of methods for studying which products or categories from a retail assortment
	are purchased together in a single shopping trip. This contribution takes a
	closer look at exploratory market basket analysis, which aims at condensing
	and compactly representing the association structures found in (usually very
	large) retail transaction data. With its enormous number of available
	extension packages, the freely available statistical software R is an ideal
	basis for carrying out such market basket analyses. The infrastructure for
	transaction data provided by the extension package arules offers a flexible
	foundation for market basket analysis. It supports the efficient
	representation, manipulation and analysis of market basket data together with
	arbitrary additional information on products (for example, the assortment
	hierarchy) and on transactions (for example, revenue or contribution margin).
	The package is seamlessly integrated into R and therefore allows the direct
	application of existing state-of-the-art methods for sampling, clustering and
	visualizing market basket data. In addition, arules provides standard
	algorithms for finding association rules and the data structures needed to
	analyze the resulting patterns. A selection of the most important functions is
	demonstrated using a real-world transaction data set from grocery retailing.},
  pdf = {http://michael.hahsler.net/research/arules_WUCompDay2006/arules.pdf},
  category = {association rules, marketing}
}
@incollection{hahsler:Hahsler2006b,
  author = {Michael Hahsler and Kurt Hornik and Thomas Reutterer},
  title = {Implications of Probabilistic Data Modeling for Mining Association
	Rules},
  booktitle = {From Data and Information Analysis to Knowledge Engineering},
  year = {2006},
  editor = {M. Spiliopoulou and R. Kruse and C. Borgelt and A. N{\"u}rnberger
	and W. Gaul},
  series = {Studies in Classification, Data Analysis, and Knowledge Organization},
  pages = {598--605},
  publisher = {Springer-Verlag},
  abstract = {Mining association rules is an important technique for discovering
	meaningful patterns in transaction databases. In the current literature,
	the properties of algorithms to mine association rules are discussed
	in great detail. We present a simple probabilistic framework for
	transaction data which can be used to simulate transaction data when
	no associations are present. We use such data and a real-world grocery
	database to explore the behavior of confidence and lift, two popular
	interest measures used for rule mining. The results show that confidence
	is systematically influenced by the frequency of the items in the
	left-hand-side of rules and that lift performs poorly to filter random
	noise in transaction data. The probabilistic data modeling approach
	presented in this paper not only is a valuable framework to analyze
	interest measures but also provides a starting point for further
	research to develop new interest measures which are based on statistical
	tests and geared towards the specific properties of transaction data.},
  pdf = {http://michael.hahsler.net/research/probRuleMining_gfkl2005/probRuleMining_gfkl2005.pdf},
  url = {http://www.springerlink.com/content/978-3-540-31314-4/},
  category = {association rules}
}
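A small sketch of the simulation idea from the abstract (transaction data with no associations present), assuming the random.transactions() generator in the arules package; sizes and thresholds are arbitrary illustration values:

library(arules)
# simulate transactions with independent items, i.e., a null model without associations
trans <- random.transactions(nItems = 50, nTrans = 10000)
rules <- apriori(trans, parameter = list(support = 0.0005, confidence = 0.01, minlen = 2))
summary(quality(rules)$lift)       # lift values of the (spurious) rules found in random data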
@incollection{hahsler:Breidert2005,
  author = {Christoph Breidert and Michael Hahsler and Lars Schmidt-Thieme},
  title = {Reservation Price Estimation by Adaptive Conjoint Analysis},
  booktitle = {Classification - the Ubiquitous Challenge},
  year = {2005},
  editor = {Weihs, Claus and Gaul, Wolfgang},
  series = {Studies in Classification, Data Analysis, and Knowledge Organization},
  pages = {577--584},
  publisher = {Springer-Verlag},
  abstract = {Though reservation prices are needed for many business decision processes,
	e.g., pricing new products, it often turns out to be difficult to
	measure them. Many researchers reuse conjoint analysis data with
	price as an attribute for this task (e.g., Kohli and Mahajan (1991)).
	In this setting the information whether a consumer buys a product at all
	is not elicited, which makes reservation price estimation impossible.
	We propose an additional interview scene at the end of the adaptive
	conjoint analysis (Johnson (1987)) to estimate reservation prices
	for all product configurations. This will be achieved by the usage
	of product stimuli as well as price scales that are adapted for each
	proband to reflect individual choice behavior. We present preliminary
	results from an ongoing large-sample conjoint interview of customers
	of a major mobile phone retailer in Germany.},
  pdf = {http://michael.hahsler.net/research/reservation_gfkl2004/gfkl2004.pdf},
  url = {http://www.springerlink.com/content/978-3-540-28084-2/},
  category = {marketing, optimization}
}
@inproceedings{hahsler:Fessler2005,
  author = {Georg Fessler and Michael Hahsler and Michaela Putz},
  title = {{ePubWU -- Erfahrungen mit einer Volltext an der Wirtschaftsuniversit{\"a}t
	Wien}},
  booktitle = {Bibliotheken -- Fundament der Bildung, 28. \"Osterreichischer Bibliothekartag
	2004},
  year = {2005},
  editor = {Christian Enichlmayr},
  series = {Schriftenreihe der O{\"o}. Landesbibliothek},
  pages = {190--193},
  abstract = {ePubWU is an electronic platform for scholarly publications of the
	Wirtschaftsuniversit\"at Wien (WU), on which research-related publications of
	the WU are made accessible in full text via the WWW. ePubWU is operated as a
	joint project of the university library of the Wirtschaftsuniversit\"at Wien
	and the Department of Information Business. Currently, two types of
	publications are collected in ePubWU -- working papers and doctoral theses.
	This contribution reports on experiences from more than two years of running
	the project, including the areas of acquisition, workflows, indexing, and
	dissemination.},
  isbn = {3-85252-684-1},
  category = {digital libraries}
}
@incollection{hahsler:Hahsler2004a,
  author = {Michael Hahsler},
  title = {A Quantitative Study of the Adoption of Design Patterns by Open Source
	Software Developers},
  booktitle = {Free/Open Source Software Development},
  publisher = {Idea Group Publishing},
  year = {2005},
  editor = {S. Koch},
  pages = {103--123},
  abstract = {Several successful projects (Linux, FreeBSD, BIND, Apache, etc.)
	showed that the collaborative and self-organizing process of developing
	open source software produces reliable, high quality software. Without
	doubt, the open source software development process differs in many
	ways from the traditional development process in a commercial environment.
	An interesting research question is how these differences influence
	the adoption of traditional software engineering practices. In this
	chapter we investigate how design patterns, a widely accepted software
	engineering practice, are adopted by open source developers for documenting
	changes. We analyze the development process of almost 1,000 open
	source software projects using version control information and explore
	differences in pattern adoption using characteristics of projects
	and developers. By analyzing these differences we provide evidence
	that design patterns are an important practice in open source projects
	and that there exist significant differences between developers who
	use design patterns and who do not.},
  pdf = {http://michael.hahsler.net/research/patterns_oss2004/OSS_patterns_preprint.pdf},
  url = {http://www.idea-group.com/books/details.asp?id=4368},
  category = {software engineering}
}
@inproceedings{hahsler:Hahsler2005e,
  author = {Michael Hahsler},
  title = {Optimizing Web Sites for Customer Retention},
  booktitle = {Proceedings of the 2005 International Workshop on Customer Relationship
	Management: Data Mining Meets Marketing, November 18--19, 2005, New
	York City, USA},
  year = {2005},
  editor = {Bing Liu and Myra Spiliopoulou and Jaideep Srivastava and Alex Tuzhilin},
  abstract = {With customer relationship management (CRM) companies move away from
	a mainly product-centered view to a customer-centered view. Resulting
	from this change, the effective management of how to keep contact
	with customers throughout different channels is one of the key success
	factors in today's business world. Company Web sites have evolved
	in many industries into an extremely important channel through which
	customers can be attracted and retained. To analyze and optimize
	this channel, accurate models of how customers browse through the
	Web site and what information within the site they repeatedly view
	are crucial. Typically, data mining techniques are used for this
	purpose. However, there already exist numerous models developed in
	marketing research for traditional channels which could also prove
	valuable to understanding this new channel. In this paper we propose
	the application of an extension of the Logarithmic Series Distribution
	(LSD) model to model repeat-usage of Web-based information and thus to analyze
	and optimize a Web site's capability to support one goal of CRM,
	to retain customers. As an example, we use the university's blended
	learning web portal with over a thousand learning resources to demonstrate
	how the model can be used to evaluate and improve the Web site's
	effectiveness.},
  pdf = {http://michael.hahsler.net/research/LSD_CRM2005/LSD_CRM2005.pdf},
  category = {marketing, recommender systems}
}
@techreport{hahsler:Hahsler2005c,
  author = {Michael Hahsler and Bettina Gr{\"u}n and Kurt Hornik},
  title = {A Computational Environment for Mining Association Rules and Frequent
	Item Sets},
  institution = {Research Report Series, Department of Statistics and Mathematics,
	Wirtschaftsuniversit{\"a}t Wien},
  year = {2005},
  type = {Report},
  number = {15},
  address = {Augasse 2--6, 1090 Wien, Austria},
  month = {April},
  abstract = { Mining frequent itemsets and association rules is a popular and well
	researched approach to discovering interesting relationships between
	variables in large databases. The R package arules presented in this
	paper provides a basic infrastructure for creating and manipulating
	input data sets and for analyzing the resulting itemsets and rules.
	The package also includes interfaces to two fast mining algorithms,
	the popular C implementations of Apriori and Eclat by Christian Borgelt.
	These algorithms can be used to mine frequent itemsets, maximal frequent
	itemsets, closed frequent itemsets and association rules. },
  nopdf = {http://michael.hahsler.net/research/arules_workingpaper15_2005/arules.pdf},
  url = {http://epub.wu.ac.at/132},
  category = {association rules}
}
@article{hahsler:Hahsler2005f,
  author = {Michael Hahsler and Bettina Gr{\"u}n and Kurt Hornik},
  title = {arules -- {A} Computational Environment for Mining Association Rules
	and Frequent Item Sets},
  journal = {Journal of Statistical Software},
  year = {2005},
  volume = {14},
  pages = {1--25},
  number = {15},
  month = {October},
  abstract = {Mining frequent itemsets and association rules is a popular and well
	researched approach for discovering interesting relationships between
	variables in large databases. The R package arules presented in this
	paper provides a basic infrastructure for creating and manipulating
	input data sets and for analyzing the resulting itemsets and rules.
	The package also includes interfaces to two fast mining algorithms,
	the popular C implementations of Apriori and Eclat by Christian Borgelt.
	These algorithms can be used to mine frequent itemsets, maximal frequent
	itemsets, closed frequent itemsets and association rules.},
  issn = {1548-7660},
  nopdf = {http://michael.hahsler.net/research/arules_jss2005/v14i15.pdf},
  url = {http://dx.doi.org/10.18637/jss.v014.i15},
  doi = {10.18637/jss.v014.i15},
  category = {association rules}
}
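A minimal end-to-end sketch of the arules workflow described above, assuming the Groceries data set shipped with the package and its interface to Borgelt's Apriori implementation; the thresholds are illustrative:

library(arules)                      # infrastructure plus interfaces to Apriori and Eclat
data("Groceries")                    # example market basket data shipped with arules
summary(Groceries)                   # items, density, transaction length distribution
rules <- apriori(Groceries, parameter = list(support = 0.01, confidence = 0.3))
inspect(head(sort(rules, by = "lift"), 5))   # the five rules with the highest lift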
@techreport{hahsler:Hahsler2005b,
  author = {Michael Hahsler and Kurt Hornik and Thomas Reutterer},
  title = {Implications of Probabilistic Data Modeling for Rule Mining},
  institution = {Research Report Series, Department of Statistics and Mathematics,
	Wirtschaftsuniversit{\"a}t Wien},
  year = {2005},
  type = {Report},
  number = {14},
  address = {Augasse 2--6, 1090 Wien, Austria},
  month = {March},
  abstract = { Mining association rules is an important technique for discovering
	meaningful patterns in transaction databases. In the current literature,
	the properties of algorithms to mine associations are discussed in
	great detail. In this paper we investigate properties of transaction
	data sets from a probabilistic point of view. We present a simple
	probabilistic framework for transaction data and its implementation
	using the R statistical computing environment. The framework can
	be used to simulate transaction data when no associations are present.
	We use such data to explore the ability to filter noise of confidence
	and lift, two popular interest measures used for rule mining. Based
	on the framework we develop the measure hyperlift and we compare
	this new measure to lift using simulated data and a real-world grocery
	database. },
  nopdf = {http://michael.hahsler.net/research/probDataMining_wp2005/hyperlift.pdf},
  url = {http://epub.wu.ac.at/764},
  category = {association rules}
}
@inproceedings{hahsler:Hahsler2005,
  author = {Michael Hahsler and Stefan Koch},
  title = {Discussion of a large-scale open source data collection methodology},
  booktitle = {38th Annual Hawaii International Conference on System Sciences (HICSS'05),
	January 3--6, 2005 Hilton Waikoloa Village, Big Island, Hawaii},
  year = {2005},
  publisher = {IEEE Computer Society Press},
  abstract = { In this paper we discuss in detail a possible methodology for collecting
	repository data on a large number of open source software projects
	from a single project hosting and community site. The process of
	data retrieval is described along with the possible metrics that
	can be computed and which can be used for further analyses. Example
	research areas to be addressed with the available data and first
	results are given. Then, both advantages and disadvantages of the
	proposed methodology are discussed together with implications for
	future approaches.},
  pdf = {http://michael.hahsler.net/research/oss_hicss2005/oss_hicss2005.pdf},
  url = {http://csdl.computer.org/comp/proceedings/hicss/2005/2268/07/22680197babs.htm},
  category = {software engineering}
}
@misc{hahsler:Fessler2003,
  author = {Georg Fessler and Michael Hahsler and Michaela Putz and Judith Schwarz
	and Brigitta Wiebogen},
  title = {{Projektbericht ePubWU 2001--2003}},
  howpublished = {Augasse 2--6, 1090 Wien, Wirtschaftsuniversit{\"a}t Wien},
  month = jan,
  year = {2004},
  abstract = {ePubWU is an electronic platform for scholarly publications of the
	Wirtschaftsuniversit{\"a}t Wien (WU), on which research-related publications
	of the WU are made accessible in full text via the WWW. ePubWU has been in
	production since January 2002 and is operated as a joint project of the
	university library of the Wirtschaftsuniversit{\"a}t Wien and the Department
	of Information Business. This report describes the experiences gained during
	the two-year pilot phase of the project.},
  address = {Augasse 2--6, 1090 Wien},
  institution = {Wirtschaftsuniversit{\"a}t Wien},
  pdf = {http://michael.hahsler.net/research/ePub_bericht_2004/ePub-Projektbericht_01-03.pdf},
  category = {digital libraries}
}
@techreport{hahsler:Hafner2004,
  author = {Susanne Hafner and Michael Hahsler},
  title = {{Preisvergleich zwischen Online-Shops und traditionellen Gesch{\"a}ften:
	Fallstudie Spieleeinzelhandel}},
  institution = {Working Papers on Information Processing and Information Management,
	Institut f{\"u}r Informationsverarbeitung und -wirtschaft, Wirtschaftsuniversit{\"a}t
	Wien},
  year = {2004},
  type = {Working Paper},
  number = {04/2004},
  address = {Augasse 2--6, 1090 Wien, Austria},
  month = aug,
  abstract = { This paper compares prices between online shops and traditional
	stores. Several earlier studies have tried to establish price differences
	between online and traditional stores in order to confirm the hypothesis that
	online markets are more efficient because of their higher transparency and
	lower transaction costs. So far, these studies have examined product groups
	such as CDs and books. In this study we investigate the previously unexplored
	games retail sector and concentrate on the Austrian market. The question is
	whether the Austrian market yields results similar to or different from the
	markets studied so far (mainly in North America). The study shows the
	following: prices for games are about 20 percent lower in the electronic
	market than in the traditional market, and the price dispersion in the
	electronic and the traditional market does not differ significantly. Both
	findings agree with the results of other studies. The Austrian online board
	game retail sector is therefore developed to a similar degree as online
	retailing in other countries and for other product groups. },
  nopdf = {http://michael.hahsler.net/research/pricing_study_working2004/pricing_study_WP.pdf},
  url = {http://epub.wu.ac.at/828},
  category = {marketing}
}
@techreport{hahsler:Hahsler2004c,
  author = {Michael Hahsler},
  title = {A Model-Based Frequency Constraint for Mining Associations from Transaction
	Data},
  institution = {Working Papers on Information Processing and Information Management,
	Institut f{\"u}r Informationsverarbeitung und -wirtschaft, Wirtschaftsuniversit{\"a}t
	Wien},
  year = {2004},
  type = {Working Paper},
  number = {07/2004},
  address = {Augasse 2--6, 1090 Wien, Austria},
  month = nov,
  abstract = { In this paper we develop an alternative to minimum support which
	utilizes knowledge of the process which generates transaction data
	and allows for highly skewed frequency distributions. We apply a
	simple stochastic model (the NB model), which is known for its usefulness
	to describe item occurrences in transaction data, to develop a frequency
	constraint. This model-based frequency constraint is used together
	with a precision threshold to find individual support thresholds
	for groups of associations. We develop the notion of NB-frequent
	itemsets and present two mining algorithms which find all NB-frequent
	itemsets in a database. In experiments with publicly available transaction
	databases we show that the new constraint can provide significant
	improvements over a single minimum support threshold and that the
	precision threshold is easier to use. },
  nopdf = {http://michael.hahsler.net/research/nbd_working2004/nbd_associationrules_WP.pdf},
  url = {http://epub.wu.ac.at/1760},
  category = {association rules}
}
@inproceedings{hahsler:Hahsler2004b,
  author = {Michael Hahsler and Stefan Koch},
  title = {Cooperation and disruptive behaviour - Learning from a multi-player
	Internet gaming community},
  booktitle = {IADIS International Conference Web Based Communities 2004, Lisbon,
	Portugal, 24--26 March 2004},
  year = {2004},
  editor = {Piet Kommers and Pedro Isaias and Miguel Baptista Nunes},
  pages = {35--42},
  publisher = {International Association for Development of the Information Society
	(IADIS)},
  abstract = { In this paper we report possibilities and experiences from employing
	Counter-Strike, a popular multi-player Internet computer game and
	its resulting online community in research on cooperative behaviour.
	Advantages from using this game include easy availability of rich
	data, the emphasis on team-playing, as well as numerous possibilities
	to change the experiment settings. We use descriptive game theory
	and statistical methods to explore cooperation within the game as
	well as the way the player community deals with disruptive behaviour.
	After a quick introduction to the basic rules of Counter-Strike,
	we describe the setup of the Internet game server used. We then present
	empirical results from the game server logs where cooperation within
	the game is analyzed from a game theoretic perspective. Finally we
	discuss the applications of our results to other online communities,
	including cooperation and self-regulation in open source teams.},
  pdf = {http://michael.hahsler.net/research/webBasedComm_cs/webBasedComm_cs.pdf},
  category = {software engineering}
}
@inproceedings{hahsler:Bernroider2003a,
  author = {Edward Bernroider and Michael Hahsler and Stefan Koch and Volker
	Stix},
  title = {{Data Envelopment Analysis zur Unterst{\"u}tzung der Auswahl und
	Einf{\"u}hrung von ERP-Systemen}},
  booktitle = {Informationswirtschaft: Ein Sektor mit Zukunft, Symposium 4.--5.
	September 2003, Wien, {{\"O}}sterreich},
  year = {2003},
  editor = {Andreas Geyer-Schulz and Alfred Taudes },
  series = {Lecture Notes in Informatics (LNI) P-33},
  pages = {11--26},
  publisher = {Gesellschaft f{\"u}r Informatik},
  abstract = {More and more companies adopt standard enterprise software packages
	such as SAP R/3 or BaaN. For most companies, the selection and implementation
	of such a system is a strategically important IT project associated with
	substantial risks. Selecting the most suitable system requires supporting a
	group decision process, and the subsequent implementation project has to be
	carried out efficiently, in line with ``best practices''. Using examples,
	this paper shows how both processes -- the selection and the implementation --
	can be supported by Data Envelopment Analysis.},
  url = {http://www.gi-ev.de/},
  category = {marketing}
}
@incollection{hahsler:Geyer-Schulz2003e,
  author = {Andreas Geyer-Schulz and Michael Hahsler},
  title = {Comparing two Recommender Algorithms with the Help of Recommendations
	by Peers},
  booktitle = {WEBKDD 2002 - Mining Web Data for Discovering Usage Patterns and
	Profiles 4th International Workshop, Edmonton, Canada, July 2002,
	Revised Papers},
  publisher = {Springer-Verlag},
  year = {2003},
  editor = {O.R. Zaiane and J. Srivastava and M. Spiliopoulou and B. Masand},
  series = {Lecture Notes in Computer Science LNAI 2703},
  pages = {137--158},
  abstract = {Since more and more Web sites, especially sites of retailers, offer
	automatic recommendation services using Web usage mining, evaluation
	of recommender algorithms has become increasingly important. In this
	paper we present a framework for the evaluation of different aspects
	of recommender systems based on the process of discovering knowledge
	in databases introduced by Fayyad et al. and we summarize research
	already done in this area. One aspect identified in the presented
	evaluation framework is widely neglected when dealing with recommender
	algorithms. This aspect is to evaluate how useful patterns extracted
	by recommender algorithms are to support the social process of recommending
	products to others, a process normally driven by recommendations
	by peers or experts. To fill this gap for recommender algorithms
	based on frequent itemsets extracted from usage data we evaluate
	the usefulness of two algorithms. The first recommender algorithm
	uses association rules, and the other algorithm is based on the repeat-buying
	theory known from marketing research. We use 6 months of usage data
	from an educational Internet information broker and compare useful
	recommendations identified by users from the target group of the
	broker (peers) with the recommendations produced by the algorithms.
	The results of the evaluation presented in this paper suggest that
	frequent itemsets from usage histories match the concept of useful
	recommendations expressed by peers with satisfactory accuracy (higher
	than 70\%) and precision (between 60\% and 90\%). The evaluation also
	suggests that both algorithms studied in the paper perform similarly
	on real-world data if they are tuned properly.},
  note = {(Revised version of the WEBKDD 2002 paper ``Evaluation of Recommender
          Algorithms for an Internet Information Broker based on Simple
          Association Rules and on the Repeat-Buying Theory'')},
  pdf = {http://michael.hahsler.net/research/recomm_lnai2002/lnai2002.pdf},
  url = {http://www.springeronline.com/sgw/cda/frontpage/0,10735,5-146-22-14095354-0,00.html},
  category = {recommender systems, association rules}
}
@incollection{hahsler:GeyerSchulz2003c,
  author = {Andreas Geyer-Schulz and Michael Hahsler and Andreas Neumann and
	Anke Thede},
  title = {Behavior-Based Recommender Systems as Value-Added Services for Scientific
	Libraries},
  booktitle = {Statistical Data Mining \& Knowledge Discovery},
  publisher = {Chapman \& Hall / CRC},
  year = {2003},
  editor = {Hamparsum Bozdogan},
  pages = {433--454},
  month = jul,
  abstract = { Amazon.com paved the way for several large-scale, behavior-based
	recommendation services as an important value-added expert advice
	service for online book shops. In this contribution we discuss the
	effects (and possible reductions of transaction costs) for such services
	and investigate how such value-added services can be implemented
	in the context of scientific libraries. For this purpose we present a
	new, recently developed recommender system based on a stochastic
	purchase incidence model, present the underlying stochastic model
	from repeat-buying theory, and analyze whether the underlying assumptions
	on consumer behavior also hold for users of scientific libraries.
	We analyzed the log files, containing approximately 85 million HTTP
	transactions of the web-based online public access catalog (OPAC) of
	the library of the Universit{\"a}t Karlsruhe (TH) recorded since January
	2001, and performed some diagnostic checks. The recommender service
	has been fully operational within the library system of the
	Universit{\"a}t Karlsruhe (TH) since 2002/06/22. },
  pdf = {http://michael.hahsler.net/research/knoxville_2002/knoxville.pdf},
  category = {recommender systems}
}
@inproceedings{hahsler:GeyerSchulz2003d,
  author = {Andreas Geyer-Schulz and Michael Hahsler and Andreas Neumann and
	Anke Thede},
  title = {{Recommenderdienste f{\"u}r wissenschaftliche Bibliotheken und Bibliotheksverb{\"u}nde}},
  booktitle = {Informationswirtschaft: Ein Sektor mit Zukunft, Symposium 4.--5.
	September 2003, Wien, {{\"O}}sterreich},
  year = {2003},
  editor = {Andreas Geyer-Schulz and Alfred Taudes },
  series = {Lecture Notes in Informatics (LNI) P-33},
  pages = {43--58},
  publisher = {Gesellschaft f{\"u}r Informatik},
  abstract = {Scientific libraries are a promising application area for
	recommender services. They can easily develop customer-centered
	service portals in the style of amazon.com. Students, university
	teachers, and researchers can reduce their share of the transaction
	costs (e.g., search and evaluation costs for information products).
	For librarians, the benefit lies in improved customer advice through
	recommendations and in additional support for market research,
	product evaluation, and collection management. In this contribution
	we present a strategy with which behavior-based, distributed
	recommender services can be integrated into existing library systems
	with minimal effort, and we report on our experiences from
	introducing such a service at the university library of the
	Universit{\"a}t Karlsruhe (TH).},
  url = {http://www.gi-ev.de/},
  category = {recommender systems}
}
@inproceedings{hahsler:GeyerSchulz2003a,
  author = {Andreas Geyer-Schulz and Michael Hahsler and Andreas Neumann and
	Anke Thede},
  title = {An Integration Strategy for Distributed Recommender Services in Legacy
	Library Systems},
  booktitle = {Between Data Science and Applied Data Analysis, Proceedings of the
	26th Annual Conference of the Gesellschaft f{\"u}r Klassifikation
	e.V., University of Mannheim, July 22--24, 2002},
  year = {2003},
  editor = {M. Schader and W. Gaul and M. Vichi},
  series = {Studies in Classification, Data Analysis, and Knowledge Organization},
  pages = {412--420},
  month = jul,
  publisher = {Springer-Verlag},
  abstract = { Scientific library systems are a very promising application area
	for recommender services. Scientific libraries could easily develop
	customer-oriented service portals in the style of amazon.com. Students,
	university teachers and researchers can reduce their transaction
	cost (i.e. search and evaluation cost of information products). For
	librarians, the advantage is an improvement of customer support
	through recommendations and additional support in marketing research,
	product evaluation, and book selection. In this contribution we present
	a strategy for integrating a behavior-based distributed recommender
	service in legacy library systems with minimal changes in the legacy
	system. },
  url = {http://www.springer.com/east/home/business/business+information+systems?SGWID=5-170-69-173622621-0},
  category = {recommender systems}
}
@inproceedings{hahsler:GeyerSchulz2003b,
  author = {Andreas Geyer-Schulz and Michael Hahsler and Anke Thede},
  title = {Comparing association-rules and repeat-buying based recommender systems
	in a {B2B} environment},
  booktitle = {Between Data Science and Applied Data Analysis, Proceedings of the
	26th Annual Conference of the Gesellschaft f{\"u}r Klassifikation
	e.V., University of Mannheim, July 22--24, 2002},
  year = {2003},
  editor = {M. Schader and W. Gaul and M. Vichi},
  series = {Studies in Classification, Data Analysis, and Knowledge Organization},
  pages = {421--429},
  month = jul,
  publisher = {Springer-Verlag},
  abstract = { In this contribution we present a systematic evaluation and comparison
	of recommender systems based on simple association rules and on repeat-buying
	theory. Both recommender services are based on the customer purchase
	histories of a medium-sized B2B-merchant for computer accessories.
	With the help of product managers an evaluation set for recommendations
	was generated. With regard to this evaluation set, recommendations
	produced by both methods are evaluated and several error measures
	are computed. This provides an empirical test of whether frequent
	itemsets or outliers of a stochastic purchase incidence model are suitable
	concepts for automatically generating recommendations. Furthermore,
	the loss functions (performance measures) of the two models are compared
	and the sensitivity with regard to a misspecification of the model
	parameters is discussed. },
  url = {http://www.springerlink.com/content/978-3-540-20304-9/},
  category = {recommender systems}
}
@techreport{hahsler:Hahsler2003,
  author = {Michael Hahsler},
  title = {A Quantitative Study of the Application of Design Patterns in Java},
  institution = {Working Papers on Information Processing and Information Management,
	Institut f{\"u}r Informationsverarbeitung und -wirtschaft, Wirtschaftsuniversit{\"a}t
	Wien},
  year = {2003},
  type = {Working Paper},
  number = {01/2003},
  address = {Augasse 2--6, 1090 Wien, Austria},
  month = jan,
  abstract = { Using design patterns is a widely accepted method to improve software
	development. Many benefits of applying patterns are
	claimed in the literature. The most cited claim is that design patterns
	can provide a common design vocabulary and therefore greatly improve
	communication between software designers. Most of the claims are
	supported by experience reports of practitioners, but there is a
	lack of quantitative research concerning the actual application of
	design patterns and the realization of the claimed benefits.
	In this paper we analyze the development process of over 1000 open
	source software projects using version control information. We explore
	this information to gain insight into the differences between software
	development with and without design patterns. By analyzing these
	differences we provide evidence that design patterns are used for
	communication and that there is a significant difference between
	developers who use design patterns and those who do not. },
  html = {http://michael.hahsler.net/research/patterns_working2003/designpatterns_java.html},
  nopdf = {http://michael.hahsler.net/research/patterns_working2003/designpatterns_java.pdf},
  url = {http://epub.wu.ac.at/1646},
  category = {software engineering}
}
@article{hahsler:Hahsler2003b,
  author = {Michael Hahsler},
  title = {Integrating Digital Document Acquisition into a University Library:
	A Case Study of Social and Organizational Challenges},
  journal = {Journal of Digital Information Management},
  year = {2003},
  volume = {1},
  pages = {162--171},
  number = {4},
  month = dec,
  abstract = {In this article we report on the effort of the university library
	of the Vienna University of Economics and Business Administration
	to integrate a digital library component for research documents authored
	at the university into the existing library infrastructure. Setting
	up a digital library has become a relatively easy task using the
	current database technology and the components and tools freely
	available. However, integrating such a digital library into existing
	library systems and adapting existing document acquisition work-flows
	in the organization are non-trivial tasks. We use a research
	framework to identify the key players in this change process and to analyze
	their incentive structures. Then we describe the light-weight integration
	approach employed by our university and show how it provides incentives
	to the key players and at the same time requires only minimal adaptation
	of the organization in terms of changing existing work-flows. Our
	experience suggests that this light-weight integration offers a
	cost-efficient and low-risk intermediate step towards switching to exclusive
	digital document acquisition.},
  issn = {0972-7272},
  pdf = {http://michael.hahsler.net/research/ePub_jdim2003/IntegratingDDAcquisition_final.pdf},
  url = {http://www.dirf.org/jdim/v1i4.htm},
  category = {digital libraries}
}
@inproceedings{hahsler:GeyerSchulz2002,
  author = {Walter B{\"o}hm and Andreas Geyer-Schulz and Michael Hahsler and
	Maximillian Jahn},
  title = {Repeat Buying Theory and its Application for Recommender Services},
  booktitle = {{Exploratory Data Analysis in Empirical Research, Proceedings of
	the 25th Annual Conference of the Gesellschaft f{\"u}r Klassifikation
	e.V., University of Munich, March 14--16, 2001}},
  year = {2002},
  editor = {O. Opitz and M. Schwaiger},
  pages = {229--239},
  publisher = {Springer-Verlag},
  abstract = {In the context of a virtual university's information broker we study
	the consumption patterns for information goods and we investigate
	whether Ehrenberg's repeat-buying theory, which successfully models regularities
	in a large number of consumer product markets, can also be applied in electronic
	markets for information goods. First results indicate that Ehrenberg's
	repeat-buying theory succeeds in describing the consumption patterns
	of bundles of complementary information goods reasonably well and
	that this can be exploited for automatically generating anonymous
	recommendation services based on such information bundles. An experimental
	anonymous recommender service has been implemented and is currently
	being evaluated in the Virtual University of the Vienna University of Economics
	and Business Administration at http://vu.wu-wien.ac.at.},
  pdf = {http://michael.hahsler.net/research/recomm_gfkl2001/gfkl2001.pdf},
  url = {http://www.springer.com/east/home/business/business+information+systems?SGWID=5-170-69-173622621-0},
  category = {recommender systems}
}
@article{hahsler:GeyerSchulz2002a,
  author = {Wolfgang Gaul and Andreas Geyer-Schulz and Michael Hahsler and Lars
	Schmidt-Thieme},
  title = {{eMarketing mittels Recommendersystemen}},
  journal = {{Marketing ZFP}},
  year = {2002},
  volume = {24},
  pages = {47--55},
  abstract = {Recommender systems make an important contribution to the design
	of eMarketing activities. Starting from a discussion of input/output
	characteristics for describing such systems, which already allow a
	suitable distinction between the forms relevant in practice, we motivate
	why such a characterization has to be enriched by methodological aspects
	from marketing research. A recommender system based on the theory of
	repeat-buying behavior as well as a system that generates recommendations
	by analyzing the navigation behavior of site visitors are presented.
	Using the Amazon site as an example, the marketing possibilities of
	recommender systems are illustrated. Finally, further literature related
	to recommender systems is reviewed, and an outlook indicates the
	directions in which further developments are planned.},
  series = {Spezialausgabe ''E-Marketing''},
  url = {http://www.jstor.org/stable/42746129},
  category = {recommender systems, marketing}
}
@inproceedings{hahsler:GeyerSchulz2002d,
  author = {Andreas Geyer-Schulz and Michael Hahsler},
  title = {Evaluation of Recommender Algorithms for an Internet Information
	Broker based on Simple Association Rules and on the Repeat-Buying
	Theory},
  booktitle = {Fourth WEBKDD Workshop: Web Mining for Usage Patterns \& User Profiles},
  year = {2002},
  editor = {Brij Masand and Myra Spiliopoulou and Jaideep Srivastava and Osmar
	R. Zaiane},
  pages = {100--114},
  address = {Edmonton, Canada},
  month = jul,
  abstract = {Association rules are a widely used technique to generate recommendations
	in commercial and research recommender systems. Since more and more
	Web sites, especially of retailers, offer automatic recommender services
	using Web usage mining, evaluation of recommender algorithms becomes
	increasingly important. In this paper we first present a framework
	for the evaluation of different aspects of recommender systems based
	on the process of discovering knowledge in databases of Fayyad et
	al. and then we focus on the comparison of the performance of two
	recommender algorithms based on frequent itemsets. The first recommender
	algorithm uses association rules, and the other recommender algorithm
	is based on the repeat-buying theory known from marketing research.
	For the evaluation we concentrated on how well the patterns extracted
	from usage data match the concept of useful recommendations of users.
	We use 6 months of usage data from an educational Internet information
	broker and compare useful recommendations identified by users from
	the target group of the broker with the results of the recommender
	algorithms. The results of the evaluation presented in this paper
	suggest that frequent itemsets from purchase histories match the
	concept of useful recommendations expressed by users with satisfactory
	accuracy (higher than 70\%) and precision (between 60\% and 90\%).
	The evaluation also suggests that both algorithms studied in the
	paper perform similarly on real-world data if they are tuned properly.},
  pdf = {http://michael.hahsler.net/research/recomm_webkdd2002/final/webkdd2002.pdf},
  category = {recommender systems, association rules}
}
@inproceedings{hahsler:GeyerSchulz2002c,
  author = {Andreas Geyer-Schulz and Michael Hahsler},
  title = {Software Reuse with Analysis Patterns},
  booktitle = {Proceedings of the 8th AMCIS},
  year = {2002},
  pages = {1156--1165},
  address = {Dallas, TX},
  month = aug,
  publisher = {Association for Information Systems},
  abstract = {The purpose of this article is to promote reuse of domain knowledge
	by introducing patterns already in the analysis phase of the software
	life-cycle. We propose an outline template for analysis patterns
	that strongly supports the whole analysis process from the requirements
	analysis to the analysis model and further on to its transformation
	into a flexible and reusable design and implementation. As an example
	we develop a family of analysis patterns in this paper that deal
	with a series of pressing problems in cooperative work, collaborative
	information filtering and sharing, and knowledge management. We evaluate
	the reuse potential of these patterns by analyzing several components
	of an information system that was developed for the Virtual University
	project of the Vienna University of Economics and Business Administration.
	The findings of this analysis suggest that using patterns in the
	analysis phase has the potential to reduce development time significantly
	by introducing reuse already at the analysis stage and by improving
	the interface between the analysis and design phases.},
  pdf = {http://michael.hahsler.net/research/virlib_AMCIS2002/virlib_amcis2002.pdf},
  category = {software engineering}
}
@inproceedings{hahsler:GeyerSchulz2001,
  author = {Andreas Geyer-Schulz and Michael Hahsler and Maximillian Jahn},
  title = {Recommendations for Virtual Universities from Observed User Behavior},
  booktitle = {Classification, Automation, and New Media, Proceedings of the 24th
	Annual Conference of the Gesellschaft f{\"u}r Klassifikation e.V.,
	University of Passau, March 15--17, 2000 },
  year = {2002},
  editor = {W. Gaul and G. Ritter},
  pages = {273--280},
  publisher = {Springer-Verlag},
  abstract = { Recently, recommender systems have started to gain ground in commercial
	Web applications. For example, the online bookseller {\em amazon.com}
	recommends to its customers books similar to the ones they bought, based on
	an analysis of observed consumer purchase behavior. In this
	article we describe a generic architecture for recommender services
	for information markets which has been implemented in the setting
	of the Virtual University of the Vienna University of Economics and
	Business Administration (http://vu.wu-wien.ac.at). The architecture
	of a recommender service is defined as an agency of interacting software
	agents. It consists of three layers, namely the meta-data management
	system, the broker management system and the business-to-customer
	interface.},
  pdf = {http://michael.hahsler.net/research/recomm_gfkl2000/paper.pdf},
  url = {http://www.springer.com/east/home/business/business+information+systems?SGWID=5-170-69-173622621-0},
  category = {recommender systems}
}
@incollection{hahsler:GeyerSchulz2002b,
  author = {Andreas Geyer-Schulz and Michael Hahsler and Maximillian Jahn},
  title = {A Customer Purchase Incidence Model Applied to Recommender Systems},
  booktitle = {WEBKDD 2001 - Mining Log Data Across All Customer Touch Points, Third
	International Workshop, San Francisco, CA, USA, August 26, 2001,
	Revised Papers},
  publisher = {Springer-Verlag},
  year = {2002},
  editor = {R. Kohavi and B.M. Masand and M. Spiliopoulou and J. Srivastava},
  series = {Lecture Notes in Computer Science LNAI 2356},
  pages = {25--47},
  month = jul,
  abstract = {In this contribution we transfer a customer purchase incidence model
	for consumer products which is based on Ehrenberg's repeat-buying
	theory to Web-based information products. Ehrenberg's repeat-buying
	theory successfully describes regularities in a large number of consumer
	product markets. We show that these regularities exist in electronic
	markets for information goods, too, and that purchase incidence models
	provide a well-founded theoretical basis for recommender and alert
	services. The article consists of two parts. In the first part, Ehrenberg's
	repeat-buying theory and its assumptions are reviewed and adapted
	for web-based information markets. Second, we present the empirical
	validation of the model based on data collected from the information
	market of the Virtual University of the Vienna University of Economics
	and Business Administration from September 1999 to May 2001.},
  note = {(Revised version of the WEBKDD 2001 paper ``A Customer Purchase 
          Incidence Model Applied to Recommender Systems'')},
  pdf = {http://michael.hahsler.net/research/recomm_lncs2001/lncswebkdd2001a/lncswebkdd2001a.pdf},
  url = {http://www.springerlink.com/content/mb2rqan13gy9/},
  category = {recommender systems}
}
@techreport{hahsler:GeyerSchulz2001d,
  author = {Andreas Geyer-Schulz and Michael Hahsler},
  title = {Software Engineering with Analysis Patterns},
  institution = {Working Papers on Information Processing and Information Management,
	Institut f{\"u}r Informationsverarbeitung und -wirtschaft, Wirtschaftsuniversit{\"a}t
	Wien},
  year = {2001},
  type = {Working Paper},
  number = {01/2001},
  address = {Augasse 2--6, 1090 Wien, Austria},
  month = nov,
  abstract = { The purpose of this article is twofold: first, to promote the use
	of patterns in the analysis phase of the software life-cycle by proposing
	an outline template for analysis patterns that strongly supports
	the whole analysis process from the requirements analysis to the
	analysis model and further on to its transformation into a flexible
	design. Second, we present, as an example, a family of analysis patterns
	that deal with a series of pressing problems in cooperative work,
	collaborative information filtering and sharing, and knowledge management.
	We present the step-by-step evolution of the analysis pattern virtual
	library with active agents starting with a simple pinboard. },
  nopdf = {http://michael.hahsler.net/research/virlib_working2001/virlib.pdf},
  url = {http://epub.wu.ac.at/592},
  category = {software engineering}
}
@inproceedings{hahsler:GeyerSchulz2001e,
  author = {Andreas Geyer-Schulz and Michael Hahsler and Maximillian Jahn},
  title = {{Wissenschaftliche Recommendersysteme in Virtuellen Universit{\"a}ten}},
  booktitle = {Unternehmen Hochschule},
  year = {2001},
  editor = {H.-J. Appelrath and R. Beyer and U. Marquardt and H.C. Mayr and C.
	Steinberger},
  address = {Wien, {\"O}sterreich},
  month = sep,
  note = {Symposium UH2001, GI Lecture Notes in Informatics (LNI)},
  abstract = { In this contribution we examine the role of recommender systems
	and their potential in the teaching, learning, and research environment
	of a Virtual University. The main idea is to exploit the information
	aggregation capabilities of recommender systems in a Virtual University
	in order to automatically improve tutoring and consulting services,
	thereby personalizing the support and advising of students and making
	it available to a larger number of participants while at the same time
	reducing the load on the instructors. In the second part of this
	contribution we describe the recommender services of myVU, the
	collection of the personalized services of the Virtual University (VU)
	of the Wirtschaftsuniversit{\"a}t Wien, and their non-personalized
	variants, which are essentially based on observed user behavior and,
	in the personalized variant, additionally on self-selection through
	self-assessment of experience in a subject area. Finally, the
	innovative use of such systems is discussed and illustrated with
	several scenarios. },
  pdf = {http://michael.hahsler.net/research/unternehmenhochschule2001/uh2001.pdf},
  url = {http://www.gi-ev.de/},
  category = {recommender systems}
}
@article{hahsler:GeyerSchulz2001b,
  author = {Andreas Geyer-Schulz and Michael Hahsler and Maximillian Jahn},
  title = {Educational and Scientific Recommender Systems: Designing the Information
	Channels of the Virtual University},
  journal = {International Journal of Engineering Education},
  year = {2001},
  volume = {17},
  pages = {153--163},
  number = {2},
  abstract = {In this article we investigate the role of recommender systems and
	their potential in the educational and scientific environment of
	a Virtual University. The key idea is to use the information aggregation
	capabilities of a recommender system to improve the tutoring and
	consulting services of a Virtual University in an automated way and
	thus scale tutoring and consulting in a personalized way to a mass
	audience. We describe the recommender services of myVU, the collection
	of the personalized services of the Virtual University (VU) of the
	Vienna University of Economics and Business Administration, which
	are based on observed user behavior and self-assignment of experience
	and which are currently being field-tested. We show how the usual mechanism
	design problems inherent to recommender systems are addressed in
	this prototype.},
  issn = {0949-149X},
  pdf = {http://michael.hahsler.net/research/recomm_ijee2001/paper.pdf},
  series = {Special Issue on Virtual Universities},
  category = {recommender systems}
}
@inproceedings{hahsler:GeyerSchulz2001c,
  author = {Andreas Geyer-Schulz and Michael Hahsler and Maximillian Jahn},
  title = {A Customer Purchase Incidence Model Applied to Recommender Systems},
  booktitle = {WEBKDD2001 Workshop: Mining Log Data Across All Customer TouchPoints},
  year = {2001},
  pages = {35--45},
  address = {San Francisco, CA},
  month = aug,
  abstract = {In this contribution we transfer a customer purchase incidence model
	for consumer products which is based on Ehrenberg's repeat-buying
	theory to Web-based information products. Ehrenberg's repeat-buying
	theory successfully describes regularities in a large number of consumer
	product markets. We show that these regularities exist in electronic
	markets for information goods, too, and that purchase incidence models
	provide a well-founded theoretical foundation for recommender and
	alert systems. The article consists of three parts. First, we present
	the architecture of an information market and its instrumentation
	for collecting data on customer behavior. In the second part Ehrenberg's
	repeat-buying theory and its assumptions are reviewed and adapted
	for Web-based information markets. Finally, we present the empirical
	validation of the model based on data collected from the information
	market of the Virtual University of the Vienna University of Economics
	and Business Administration at http://vu.wu-wien.ac.at.},
  pdf = {http://michael.hahsler.net/research/recomm_webKDD2001/paper/geyerschulz.pdf},
  category = {recommender systems}
}
@phdthesis{hahsler:Hahsler2001,
  author = {Michael Hahsler},
  title = {Analyse Patterns im Softwareentwicklungsproze{\ss} mit Beispielen f{\"u}r
	Informationsmanagement und deren Anwendungen f{\"u}r die Virtuellen
	Universit{\"a}t der Wirtschaftsuniversit{\"a}t Wien},
  school = {Wirtschaftsuniversit{\"a}t Wien},
  year = {2001},
  type = {Dissertation},
  address = {Augasse 2--6, A 1090 Wien, {\"O}sterreich},
  month = jan,
  abstract = {Diese Arbeit besch{\"a}ftigt sich mit Analyse Patterns, der Anwendung
	von Patterns in der Analysephase der Softwareentwicklung. In der
	Designphase werden Patterns seit einigen Jahren eingesetzt, um Expertenwissen
	und Wiederverwendbarkeit in den Designproze{\ss} einflie{\ss}en zu lassen.
	Es existiert bereits eine F{\"u}lle an solchen Design Patterns. Die
	Analysephase ist ein neuer Anwendungsbereich f{\"u}r Patterns, der
	bisher in der Literatur noch nicht ausreichend behandelt wurde. In
	dieser Arbeit wird die Anwendung des Pattern-Ansatzes in der Analysephase
	aufgearbeitet und konkretisiert. Analyse Patterns unterst{\"u}tzen
	den gesamten Softwareentwicklungsproze{\ss} und helfen bekannte Probleme
	w{\"a}hrend der Analysephase zu l{\"o}sen. Dadurch k{\"o}nnen Zeit
	und Kosten bei der Entwicklung neuer Softwaresysteme eingespart werden.
	Diese Eigenschaften von Analyse Patterns werden anhand konkreter
	Beispiele in einer Case Study nachgewiesen. Diese Case Study beschreibt
	den Einsatz von in dieser Arbeit entwickelten Analyse Pattern f{\"u}r
	Informationsmanagement anhand des Projekts Virtuelle Universit{\"a}t
	der Wirtschaftsuniversit{\"a}t Wien, in dem ein Internet-Informationsbroker
	zur Unterst{\"u}tzung von Lehre und Forschung realisiert wird. Die
	Erfahrungen aus diesem Projekt werden untersucht, und die Auswirkungen
	der Analyse Patterns auf Wiederverwendung bei der Softwareentwicklung
	und auf die Akzeptanz des resultierenden Systems werden pr{\"a}sentiert.},
  pdf = {http://michael.hahsler.net/research/diss/diss.pdf},
  url = {http://epub.wu.ac.at/1866},
  category = {software engineering}
}
@inproceedings{hahsler:GeyerSchulz2000,
  author = {Andreas Geyer-Schulz and Michael Hahsler},
  title = {Automatic Labelling of References for Information Systems},
  booktitle = {Classification and Information Processing at the Turn of the Millennium,
	Proceedings of the 23rd Annual Conference of the Gesellschaft f{\"u}r
	Klassifikation e.V., University of Bielefeld, March 10--12, 1999},
  year = {2000},
  editor = {Reinhold Decker and Wolfgang Gaul},
  series = {Studies in Classification, Data Analysis, and Knowledge Organization},
  pages = {451--459},
  publisher = {Springer-Verlag},
  abstract = {Today, users of Internet information services such as Yahoo! or AltaVista
	often experience high search costs. One important reason for this
	is the necessity to browse long reference lists manually, because
	of the well-known problems of relevance ranking. A possible remedy
	is to complement the references with automatically generated labels
	which provide valuable information about the referenced information
	source. Presenting suitably labelled lists of references to users
	aims at improving the clarity and thus comprehensibility of the information
	offered and at reducing the search cost. In the following we survey
	several dimensions for labelling (time, frequency of usage, region,
	language, subject, industry, and preferences) and the corresponding
	classification problems. To solve these problems automatically we
	sketch for each problem a pragmatic mix of machine learning methods
	and report selected results.},
  pdf = {http://michael.hahsler.net/research/labeling_gfkl1999/paper/labelling.pdf},
  url = {http://www.springer.com/east/home/business/business+information+systems?SGWID=5-170-69-173622621-0},
  category = {recommender systems}
}
@inproceedings{hahsler:GeyerSchulz2000c,
  author = {Andreas Geyer-Schulz and Michael Hahsler},
  title = {{Lebenslanges virtuelles Lernen}},
  booktitle = {{Europas Arbeitswelt von Morgen}},
  year = {2000},
  editor = {Franciszek Grucza},
  pages = {51--54},
  address = {Wien},
  publisher = {Wiener Zentrum der Polnischen Akademie der Wissenschaften}
}
@incollection{hahsler:GeyerSchulz2000a,
  author = {Andreas Geyer-Schulz and Michael Hahsler and Maximillian Jahn},
  title = {myVU: A Next Generation Recommender System Based on Observed Consumer
	Behavior and Interactive Evolutionary Algorithms},
  booktitle = {Data Analysis: Scientific Modeling and Practical Applications},
  publisher = {Springer Verlag},
  year = {2000},
  editor = {Wolfgang Gaul and Otto Opitz and Martin Schader},
  series = {Studies in Classification, Data Analysis, and Knowledge Organization},
  pages = {447--457},
  address = {Heidelberg, Germany},
  abstract = {myVU is a next generation recommender system based on observed consumer
	behavior and interactive evolutionary algorithms implementing customer
	relationship management and one-to-one marketing in the educational
	and scientific broker system of a virtual university. myVU provides
	a personalized, adaptive WWW-based user interface for all members
	of a virtual university and it delivers routine recommendations for
	frequently used scientific and educational Web-sites.},
  pdf = {http://michael.hahsler.net/research/festschrift2000/paper.pdf},
  url = {http://www.springer.com/east/home/business/business+information+systems?SGWID=5-170-69-173622621-0},
  category = {recommender systems}
}
@inproceedings{hahsler:Hahsler2000,
  author = {Michael Hahsler and Bernd Simon},
  title = {User-centered Navigation Re-Design for Web-based Information Systems},
  booktitle = {Proceedings of the Sixth Americas Conference on Information Systems
	(AMCIS 2000)},
  year = {2000},
  editor = {H. Michael Chung},
  pages = {192--198},
  address = {Long Beach, CA},
  publisher = {Association for Information Systems},
  abstract = {Navigation design for web-based information systems (e.g. e-commerce
	sites, intranet solutions) that ignores user-participation reduces
	the system's value and can even lead to system failure. In this paper
	we introduce a user-centered, explorative approach to re-designing
	navigation structures of web-based information systems, and describe
	how it can be implemented in order to provide flexibility and reduce
	maintenance costs. We conclude with lessons learned from the navigation
	re-design project at the Vienna University of Economics and Business
	Administration.},
  pdf = {http://michael.hahsler.net/research/webdesign_amcis2000/TT04-11_final.pdf},
  url = {http://aisel.isworld.org/article_by_author.asp?Author_ID=86},
  category = {marketing}
}
@inproceedings{hahsler:GeyerSchulz1999b,
  author = {Andreas Geyer-Schulz and Michael Hahsler and Georg Schneider},
  title = {The Virtual University as a Network Economy},
  booktitle = {Informatik '99, Unternehmen Hochschule '99, Workshop-Unterlagen},
  year = {1999},
  editor = {Heinrich C. Mayr and Claudia Steinberger and Hans-J{\"u}rgen Appelrath
	and Uwe Marquardt},
  pages = {75--86},
  address = {Bielefeld, Germany},
  month = oct
}
@article{hahsler:GeyerSchulz1999,
  author = {Andreas Geyer-Schulz and Michael Hahsler and Georg Schneider},
  title = {The Virtual University and Its Embedded Agents},
  journal = {{\"O}{G}{A}{I}{} Journal},
  year = {1999},
  volume = {18},
  pages = {14--19},
  number = {1},
  abstract = {In this article we present the current state of usage of (intelligent)
	Internet agents in the Virtual University (VU) of the Vienna University
	of Economics and Business Administration. We discuss opportunities and challenges for
	the development of several classes of agents and their sensor systems.
	More specifically, agents of the following classes embedded in the
	virtual university system will be presented: (1) robots which support
	navigation services and (2) robots which support communication and
	collaboration.},
  issn = {0254-4326}
}
@article{hahsler:GeyerSchulz1998,
  author = {Peter Bruhn and Andreas Geyer-Schulz and Michael Hahsler and Markus
	Mottel},
  title = {Genetic Machine Learning and Intelligent Internet Agents},
  journal = {{\"O}{G}{A}{I}{} Journal},
  year = {1998},
  volume = {17},
  pages = {18--25},
  number = {1},
  abstract = { In this paper we report on the status quo of the current machine
	learning research projects at the Department of Applied Computer
	Science of the Institute of Information Processing and Information
	Economics of the Vienna University of Economics and Business Administration.
	The current research activities can be categorized as follows: (1)
	Development of a theoretic framework of genetic programming. (2)
	Application of genetic programming for managerial and economic decision-making
	and for breeding agents' strategies in organizational learning. (3)
	Development, adaptation, and integration of (intelligent) Internet
	agents for the support of virtual organizations. (4) Development
	of an infrastructure for intelligent Internet agents in the ''Living
	Lectures - Virtual University'' project. (5) Cost-benefit analysis
	of agents, analysis of tactical and strategic consequences of agents
	and the analysis of their economic applications. },
  issn = {0254-4326}
}
@mastersthesis{hahsler:Hahsler1997,
  author = {Michael Hahsler},
  title = {{Software Patterns: Pinw{\"a}nde}},
  school = {Wirtschaftsuniversit{\"a}t Wien},
  year = {1997},
  type = {Diplomarbeit},
  address = {Augasse 2--6, A 1090 Wien, {\"O}sterreich},
  month = nov,
  abstract = {Diese Arbeit besch{\"a}ftigt sich mit dem Pattern-Ansatz f{\"u}r die
	Architektur von Software. Nach einer kurzen Darstellung des Ansatzes
	werden das Pinwand-Pattern und seine Varianten beschrieben. Pinw{\"a}nde
	werden verwendet, um Informationen zu sammeln und Interessierten
	zur Verf{\"u}gung zu stellen. Sie finden unter anderem in den folgenden
	Bereichen Anwendung: Groupware-Anwendungen, Conferencing Systeme,
	Diskussionsforen und Virtuelle Bibliotheken.},
  pdf = {http://michael.hahsler.net/research/diplomarbeit/dipl/pinwand_patterns.pdf},
  category = {software engineering}
}

This file was generated by bibtex2html 1.98.