@inproceedings{agrawal*93:mining, author = {Agrawal, Rakesh and Imielinski, Tomasz and Swami, Arun}, address = {Washington D.C.}, booktitle = {Proceedings of the {ACM-SIGMOD 1993} International Conference on Management of Data}, month = {May}, pages = {207-216}, title = {Mining Associations between Sets of Items in Massive Databases}, year = {1993}, annote = {\location {Internet} \readers{paffhaus} \date{28.04.1998} \keywords{data mining, KDD, association rules} \schlagworte{Data Mining, KDD, Assoziationsregeln} \abstract{Ein effizienter Algorithmus zur Berechnung von Assoziationsregeln unter Verwendung von Buffern und neuen \emph{pruning}-Techniken wird vorgestellt und dessen Effektivit\"at am Beispiel der Datenbank eines Kaufhauses vorgestellt.\par \emph{Original-Abstract: } We are given a large database of customer transactions. Each transaction consists of items purchased by a customer in a visit. We present an efficient algorithm that generates all significant association rules between items in the database. The algorithm incorporates buffer management and novel estimation and pruning techniques. We also present results of applying this algorithm to sales data obtained from a large retailing company, which shows the effectiveness of the algorithm.} }, url = {http://www.almaden.ibm.com/cs/people/ragrawal/papers/sigmod93.ps} } @inproceedings{agrawal*94:fast, author = {Agrawal, Rakesh and Srikant, Ramakrishnan}, address = {Santiago, Chile}, booktitle = {Proceedings of the 20th {VLDB} Conference}, pages = {487-499}, title = {Fast Algorithms for Mining Association Rules}, year = {1994}, annote = {\location {Jens Wolff} \readers{paffhaus} \date{28.04.1998} \keywords{data mining, KDD, apriori, association rules} \schlagworte{Data Mining, KDD, Apriori, Assoziationsregeln} \abstract{In diesem Paper werden zwei Algorithmen zur Bestimmung von Assoziationsregeln (Apriori und AprioriTid) in einer Datenbank vorgestellt. Eine Hybrid-Version, die die Vorteile beider Verfahren verbindet, wird ebenfalls beschrieben.\par \emph{Original-Abstract: } We consider the problem of discovering association rules between items in a large database of sales transactions. We present two new algorithms for solving this problem that are fundamentally different from the known algorithms. Empirical evaluation shows that these algorithms outperform the known algorithms by factors ranging from three for small problems to more than an order of magnitude for large problems. We also show how the best features of the two proposed algorithms can be combined into a hybrid algorithm, called AprioriHybrid. Scale-up experiments show that AprioriHybrid scales linearly with the number of transactions. AprioriHybrid also has excellent scale-up properties with respect to the transaction size and the number of items in the database.} \comment {Der hier vorgeschlagene Algorithmus bildet die Grundlage f\"ur viele der aktuellen Verfahren zum Auffinden von Assoziationsregeln. Die Erstellung der synthetischen Daten, die zum Vergleich von Algorithmen herangezogen werden, wird hier erstmals beschrieben.} } } @article{chen*97:data, author = {Chen, Ming-Syan and Han, Jiawei and Yu, Philip~S.}, journal = {{IEEE} Transactions on Knowledge and Data Engineering}, title = {Data Mining: An Overview from Database Perspective}, year = {1997}, annote = {\location {Internet} \readers{paffhaus} \date{28.04.1998} \keywords{data mining, KDD, overview} \schlagworte{Data Mining, KDD, \"Uberblick} \abstract{Die Autoren bieten einen \"Uberblick \"uber die verschiedenen Teilbereiche des Data Mining aus der Sicht eines Datenbankanwenders.\par \emph{Original-Abstract: } Mining information and knowledge from large databases has been recognized by many researchers as a key research topic in database systems and machine learning, and by many industrial companies as an important area with an opportunity of major revenues. Researchers in many different fields have shown great interest in data mining. Several emerging applications in information providing services, such as data warehousing and on-line services over the Internet, also call for various data mining techniques to better understand user behavior, to improve the service provided, and to increase the business opportunities. In response to such a demand, this article is to provide a survey, from a database researcher's point of view, on the data mining techniques developed recently. A classification of the available data mining techniques is provided, and a comparative study of such techniques is presented.} \comment {Ein guter Artikel, um einen ersten Eindruck vom Themengebiet ``Data Mining'' zu bekommen.} }, url = {ftp://ftp.fas.sfu.ca/pub/cs/han/kdd/survey97.ps} } @inproceedings{cheung*97:maintenance, author = {Cheung, David and Han, Jiawei and Ng, Vincent~T. and Wong, {C.Y.}}, address = {New Orleans, Louisiana, USA}, booktitle = {Proceedings of 1996 International Conference on Data Engineering {(ICDE'96)}}, month = {February}, title = {Maintenance of Discovered Association Rules in Large Databases: An Incremental Updating Technique}, year = {1996}, annote = {\location {Internet} \readers{paffhaus} \date{28.04.1998} \keywords{data mining, KDD, updating of association rules} \schlagworte{Data Mining, KDD, Updating von Assoziationsregeln} \abstract{Die Autoren schlagen ein Verfahren vor, mit dem die aus einer Datenbank extrahierten Assoziationsregeln nach einem Update der Datenbank aktualisiert werden k\"onnen, ohne sie komplett neu erstellen zu m\"ussen.\par \emph{Original-Abstract: } An incremental updating technique is developed for maintenance of the association rules discovered by database mining. There have been many studies on efficient discovery of association rules in large databases. However, it is nontrivial to maintain such discovered rules in large databases because a database may allow frequent or occasional updates and such updates may not only invalidate some existing strong association rules but also turn some weak rules into strong ones. In this study, an incremental updating technique is proposed for efficient maintenance of discovered association rules when new transaction data are added to a transaction database.} }, url = {ftp://ftp.fas.sfu.ca/pub/cs/han/kdd/icde96.ps} } @inproceedings{klemettinen*94:finding, author = {Klemettinen, Mika and Mannila, Heikki and Ronkainen, Pirjo and Toivonen, Hannu and Verkamo, A.~Inkeri}, booktitle = {Third International Conference on Information and Knowledge Management {(CIKM'94)}}, editor = {Adam, {Nabil~R.} and Bhargava, {Bharat~K.} and Yesha, Yelena }, month = {November}, pages = {401-407}, title = {Finding Interesting Rules from Large Sets of Discovered Association Rules}, year = {1994}, annote = {\location {Internet} \readers{paffhaus} \date{28.04.1998} \keywords{data mining, KDD, association Rules} \schlagworte{Data Mining, KDD, Assoziationsregeln} \abstract{Die Autoren zeigen, wie mit Hilfe von \emph{rule templates} beschrieben werden kann, welche der Unmengen von Fakten, die bei einem Mining-Lauf generiert werden, f\"ur den Anwender interessant sind. Ein weiteres Thema ist die Visualisierung der interessanten Ergebnisse.\par \emph{Original-Abstract: } Association rules, introduced by Agrawal, Imielinski, and Swami, are rules of the form ``for 90~\% of the rows of the relation, if the row has value~1 in the columns in set~$W$, then it has 1 also in column~$B$''. Efficient methods exist for discovering association rules from large collections of data. The number of discovered rules can, however, be so large that browsing the rule set and finding interesting rules from it can be quite difficult for the user. We show how a simple formalism of \emph{rule templates} makes it possible to easily describe the structure of interesting rules. We also give examples of visualization of rules, and show how a visualization tool interfaces with rule templates.} }, url = {ftp://ftp.cs.helsinki.fi/pub/Reports/by\_Project/PMDM/\\Finding\_Interesting\_Rules\_from\_Large\_Sets\_of\_Discovered\_Association\_Rules.ps.gz} } @techreport{mueller95:fast, author = {Mueller, Andreas}, institution = {University of Maryland}, month = {August}, title = {Fast Sequential and Parallel Algorithms for Association Rule Mining: A Comparison}, year = {1995}, annote = {\location {Internet} \readers{paffhaus} \date{28.04.1998} \keywords{data mining, KDD, sequential algorithms, parallel algorithms, association rules} \schlagworte{Data Mining, KDD, sequenzielle Algorithmen, parallele Algorithmen, Assoziationsregeln} \abstract{Der Autor beschreibt mehrere sequenzielle und parallele Algorithmen zum erstellen von Assoziationsregeln und untersucht ihr Laufzeitverhalten anhand von synthetischen Datenbanken.\par \emph{Original-Abstract: } The field of knowledge discovery in databases, or Data Mining, has received increasing attention during recent years as large organizations have begun to realize the potential value of the information that is stored implicitly in their databases. One specific data mining task is the mining of Association Rules, particularly from retail data. The task is to determine patterns (or rules) that characterize the shopping behavior of customers from a large database of previous consumer transactions. The rules can then be used to focus marketing efforts such as product placement and sales promotions.\par Because early algorithms required an unpredictably large number of IO operations, reducing IO cost has been the primary target of the algorithms presented in the literature. One of the most recent proposed algorithms, called PARTITION, uses a new TID-list data representation and a new partitioning technique. The partitioning technique reduces IO cost to a constant amount by processing one database portion at a time in memory. We implemented an algorithm called SPTID that incorporates both TID-lists and partitioning to study their benefits. For comparison, a non-partitioning algorithm called SEAR, which is based on a new prefix-tree data structure, is used. Our experiments with SPTID and SEAR indicate that TID-lists have inherent inefficiencies; furthermore, because all of the algorithms tested tend to be CPU-boundn trading CPU-overhead against I/O operations by partitioning did not lead to better performance.\par In order to scale mining algorithms to the huge databases (e.g., multiple Terabytes) that large organizations will manage in the near future, we implemented parallel versions of SEAR and SPEAR (its partitioned counterpart). The performance results show that, while both algorithms parallelize easily and obtain good speedup and scale-up results, the parallel SEAR version performs better than parallel SPEAR, despite the fact that it uses more communication.} \comment {Zus\"atzlich zu den eigentlichen Ergebnissen bietet dieser Artikel eine gute Einf\"uhrung und einen \"Uberblick \"uber m\"ogliche Ans\"atze beim Mining nach Assoziationsregeln.} }, url = {ftp://ftp.cs.umd.edu/pub/papers/papers/3515/3515.ps.Z} } @inproceedings{ng*98:exploratory, author = {Ng, Raymond~T. and Lakshmanan, Laks~V.S. and Han, Jiawei and Pang, Alex}, address = {Seattle, Washington}, booktitle = {Proceedings of 1998 {ACM-SIGMOD} Conference on Management of Data}, month = {June}, title = {Exploratory Mining and Pruning Optimizations of Constrained Associations Rules}, year = {1998}, annote = {\location {Internet} \readers{paffhaus} \date{28.04.1998} \keywords{data mining, KDD, constrained association rules, exploratory mining} \schlagworte{Data Mining, KDD, eingeschr\"ankte Assoziationsregeln, anwendergest\"utztes Mining} \abstract{In diesem Paper wird eine M\"oglichkeit vorgestellt, wie die Anwender eines Data Mining-Systems besser in den Vorgang des Minings eingebunden werden k\"onnen. Auf diese Weise ist es m\"oglich, die Anzahl der gefundenen Assoziationsregeln zu vermindern und auf potentiell interessante Regeln zu beschr\"anken. Diese Auswahl wird noch weiter eingeengt durch Ber\"ucksichtigung von Auswahlkriterien, die beim Stellen der Anfrage festgelegt werden.\par \emph{Original-Abstract: } From the standpoint of supporting human-centered discovery of knowledge, the present-day model of mining association rules suffers from the following serious shortcomings: (i) lack of user exploration and control, (ii) lack of focus, and (iii) rigid notion of relationships. In effect, this model functions as a black-box, admitting little user interaction in between. We propose, in this paper, an architecture that opens up the black-box, and supports constraintbased, human-centered exploratory mining of associations. The foundation of this architecture is a rich set of constraint constructs, including domain, class, and SQL-style aggregate constraints, which enable users to clearly specify what associations are to be mined. We propose \emph{constrained association queries} as a means of specifying the constraints to be satisfied by the antecedent and consequent of a mined association.\par In this paper, we mainly focus on the technical challenges in guaranteeing a level of performance that is commensurate with the selectivities of the constraints in an association query. To this end, we introduce and analyze two properties of constraints that are critical to pruning: \emph{antimonotonicity} and \emph{succinctness}. We then develop characterizations of various constraints into four categories, according to these properties. Finally, we describe a mining algorithm called CAP, which achieves a maximized degree of pruning for all categories of constraints. Experimental results indicate that CAP can run much faster, in some cases as much as 80~times, than several basic algorithms. This demonstrates how important the succinctness and anti-monotonicity properties are, in delivering the performance guarantee.} }, url = {ftp://ftp.fas.sfu.ca/pub/cs/han/kdd/sigmod98.ps} } @article{park*95:effective, author = {Park, {Jong Soo} and Chen, Ming-Syan and Yu, Philip~S.}, journal = {{SIGMOD} Record ({ACM} Special Interest Group on Management of Data)}, pages = {175-186}, title = {An Effective Hash-Based Algorithm for Mining Association Rules}, year = {1995}, annote = {\location {Jens Wolff} \readers{paffhaus} \date{28.04.1998} \keywords{data mining, KDD, association rules, hashing} \schlagworte{Data Mining, KDD, Assoziationsregeln, Hashing} \abstract{Der hier beschriebene Algorithmus benutzt ein Hash-Verfahren zur Erstellung von Assoziationsregeln. Es werden weniger Kandidaten f\"ur die Menge der gro{\ss}en \emph{itemsets} generiert, wodurch die Berechnung erheblich beschleunigt wird.\par \emph{Original-Abstract: } In this paper, we examine the issue of mining association rules among items in a large database of sales transactions. The mining of association rules can be mapped into the problem of discovering large itemsets where a large itemset is a group of items which appear in a sufficient number of transactions. The problem of discovering large itemsets can be solved by constructing a candidate set of itemsets first and then, identifying, within this candidate set, those itemsets that meet the large itemset requirement. Generally this is done iteratively for each large $k$-itemset in increasing order of $k$ where a large $k$-itemset is a large itemset with $k$~items. To determine large itemsets from a huge number of candidate large itemsets in early iterations is usually the dominating factor for the overall data mining performance. To adress this issue, we propose an effective hash-based algorithm for the candidate set generation. Explicitly, the number of candidate 2-itemsets generated by the proposed algorithm is, in orders of magnitude, smaller than that by previous methods, thus resolving the performance bottleneck. Note that the generation of smaller candidate sets enables us to effectively trim the transaction database size at a much earlier stage of the iterations, thereby reducing the computational cost for later iterations significantly. Extensive simulation study is conducted to evaluate performance of the proposed algorithm.} } } @inproceedings{park*95:efficient, author = {Park, {Jong Soo} and Chen, Ming-Syan and Yu, Philip~S.}, address = {Baltimore}, booktitle = {Proceedings of the 4th {ACM CIKM}}, pages = {31-36}, title = {Efficient Parallel Data Mining for Association Rules}, year = {1995}, annote = {\location {Jens Wolff} \readers{paffhaus} \date{28.04.1998} \keywords{data mining, KDD, association rules, parallel algorithms} \schlagworte{Data Mining, KDD, Assoziationsregeln, parallele Algorithmen} \abstract{Der hier vorgestellte Algorithmus `PDM' berechnet die Assoziationsregeln, indem er den einzelnen Prozessoren eines Parallelrechners Partitionen der Datenbank zuordnet. Die lokal gro{\ss}en `itemsets' werden nach jedem Durchlauf vereinigt und die global gro{\ss}en `itemsets' werden bestimmt. Die Grundlage f\"ur PDM ist der sequenzielle DHP-Algorithmus.\par \emph{Original-Abstract: } In this paper, we develop an algorithm, called PDM, to conduct parallel data mining for association rules. Consider a transaction as a collection of items, and a large itemset is a set of items such that the number of transactions containing it exceeds a pre-defined threshold. PDM is so designed that the global set of large itemsets can be identified efficiently and the amount of inter-node data exchange required is minimized. Specifically, with a given database partition, each processing node will collect (count) information on each itemset from its local database efficiently via a hashing method. The information discovered by each node is next shared with other nodes via some communication schemes. Then, PDM employs a technique, called \emph{clue-and-poll}, to adress the uncertainty due to the partial knowledge collected at each node by judiciously selecting a small fraction of the itemsets for the exchange of count information among nodes, thus reducing the communication cost. The global set of large itemsets can hence be determined based on the aggregate count of itemsets. It is experimentally shown that PDM not only attains very good parallelization efficiencies, but also provides robust performance for various input patterns.} } } @inproceedings{srikant*96:mining, author = {Srikant, Ramakrishnan and Agrawal, Rakesh}, address = {Montreal, Canada}, booktitle = {Proceedings of the {ACM-SIGMOD} 1996 Conference on Management of Data}, month = {June}, title = {Mining Quantitative Association Rules in Large Relational Tables}, year = {1996}, annote = {\location {Internet} \readers{paffhaus} \date{28.04.1998} \keywords{data mining, KDD, quantitative association rules} \schlagworte{Data Mining, KDD, Quantitative Assoziationsregeln} \abstract{Der Artikel beschreibt ein Verfahren, das es erm\"oglicht, quantisierte oder kategorisierte Assoziationsregeln aus einer Datenbank zu extrahieren. So wird es z.B. m\"oglich, eine Regel der form ($<$Age: 30..39$>$ and $<$Married: Yes$>$ $Rightarrow$ $<$BumCars: 2$>$) aus einer Datenbank mit den Attributen ``Age'', ``Married'' und ``NumCars'' zu erhalten.\par \emph{Original-Abstract: } We introduce the problem of mining association rules in large relational tables containing both quantitative and categorical attributes. An example of such an association might be ``10~\% of married people between age 50 and 60 have at least 2 cars''. We deal with quantitative attributes by fine-partitioning the values of the attribute and then combining adjacent partitions as necessary. We introduce measures of partial completeness which quantify the information lost due to partitioning. A direct application of this technique can generate too many similar rules. We tackle this problem by using a ``greater-than-expected-value'' interest measure to identify the interesting rules in the output. We give an algorithm for mining such quantitative association rules. Finally, we describe the results of using this approach on a real-life dataset.} }, url = {http://www.almaden.ibm.com/cs/people/ragrawal/papers/sigmond96.ps} } @inproceedings{srikant*97:mining, author = {Srikant, Ramakrishnan and Vu, Quoc and Agrawal, Rakesh}, address = {Newport Beach, California}, booktitle = {Proceedings of the 3rd International Conference on Knowledge Discovery in Databases and Data Mining}, month = {August}, title = {Mining Association Rules with Item Constraints}, year = {1997}, annote = {\location {Internet} \readers{paffhaus} \date{28.04.1998} \keywords{data mining, KDD, association rules, item constraints} \schlagworte{Data Mining, KDD, Assoziationsregeln, eingesch\"ankte Regeln} \abstract{In diesem Paper Artikel wird ein Verfahren vorgestellt, das beim Erstellen von Assoziationsregeln die Interessen des Anwenders ber\"ucksichtigt und nur solche Regeln generiert, die gegebenen Vorgaben gen\"ugen. Dadurch, da{\ss} nur eine Teilmenge der eigentlich m\"oglichen Menge von Regeln generiert werden mu{\ss}, wird die Laufzeit deutlich reduziert.\par \emph{Original-Abstract: } The problem of discovering association rules has received considerable research attention and several fast algorithms for mining association rules have been developed. In practice, users are often interested in a subset of association rules. For example, they may only want rules that contain a specific item or rules that contain children of a specific item in a hierarchy. While such constraints can be applied as a post-processing step, integrating them into the mining algorithm can dramatically reduce the execution time. We consider the problem of integrating constraints that are boolean expressions over the presence or absence of items into the association discovery algorithm. We present three integrated algorithms for mining association rules with item constraints and discuss their tradeoffs.} }, url = {http://www.almaden.ibm.com/cs/people/ragrawal/papers/kdd97\_const.ps} } @article{toivonen*95:pruning, author = {Toivonen, {H.} and Klemettinen, {M.} and Ronkainen, {P.} and H\"at\"onen, {K.} and Mannila, {H.}}, journal = {{MLnet} Workshop on Statistics, Machine Learning, and Discovery in Databases}, pages = {47-52}, title = {Pruning and Grouping Discovered Association Rules}, year = {1995}, annote = {\location {Internet} \readers{paffhaus} \date{28.04.1998} \keywords{data mining, KDD, Association Rules} \schlagworte{Data Mining, KDD, Assoziationsregeln} \abstract{Dieses Paper besch\"aftigt sich mit dem Thema, wie man die Unmenge der Assoziationsregeln, die bei einem Data Mining-Durchlauf anfallen, \"ubersichtlicher gestalten kann. Hierzu werden sog. \emph{rule covers} eingef\"uhrt. Ein weiteres Thema ist die gruppierte Anordnung von zusammengeh\"orenden Regeln.\par \emph{Original-Abstract: } Association rules are statements of the form ``for 90~\% of the rows of the relation, if the row has value~1 in the columns in set~X, then it has 1 also in the columns in set~Y.'' Efficient methods exist for discovering association rules from large collections of data. The number of discovered rules can, however, be so large that the rules cannot be presented to the user. We show how the set of rules can be pruned by forming rule covers. A rule cover is a subset of the original set of rules such that for each row in the relation there is an applicable rule in the cover if and only if there is an applicable rule in the original set. We also discuss grouping of association rules by clustering, and present some experimental results of both pruning and grouping.} }, url = {ftp://ftp.cs.helsinki.fi/pub/Reports/by\_Project/PMDM/\\Pruning\_and\_Grouping\_Discovered\_Association\_Rules.ps.gz} } @inproceedings{toivonen96:sampling, author = {Toivonen, Hannu}, address = {Mumbai (Bombay), India}, booktitle = {Proceedings of the 22nd {VLDB} Conference}, pages = {134-145}, title = {Sampling Large Databases for Association Rules}, year = {1996}, annote = {\location {Internet} \readers{paffhaus} \date{28.04.1998} \keywords{data mining, KDD, association rules, probabilistic algorithm} \schlagworte{Data Mining, KDD, Assoziationsregeln, randomisierter Algorithmus} \abstract{Dieser Artikel zeigt, da{\ss} Assoziationsregeln fast immer in einem einzigen Scan der Datenbank gefunden werden k\"onnen. Dies wird durch einen randomisierten Ansatz erreicht.\par \emph{Original-Abstract: } Discovery of association rules is an important database mining problem. Current algorithms for finding association rules require several passes over the analyzed database, and obviously the role of I/O overhead is very significant for very large databases. We present new algorithms that reduce the database activity considerably. The idea is to pick a random sample, to find using this sample all association rules that probably hold in the whole database, and then to verify the results with the rest of the database. The algorithms thus produce exact association rules, not approximations based on a sample. The approach is, however, probabilistic, and in those rare cases where our sampling method does not produce all association rules, the missing rules can be found in a second pass. Our experiments show that the proposed algorithms can find association rules very efficiently in only one database pass.} }, url = {http://www.cs.helsinki.fi/research/fdk/datamining/pubs/vldb96.ps.gz} } @inproceedings{zaki*96:evaluation, author = {Zaki, {Mohammed Javeed} and Parthasarathy, Srinivasan and Li, Wei and Ogihara, Mitsunori}, address = {Birmingham, UK}, booktitle = {7th International Workshop on Research Issues in Data Engineering {(RIDE'97)}}, month = {May}, title = {Evaluation of Sampling for Data Mining of Association Rules}, year = {1996}, annote = {\location {Internet} \readers{paffhaus} \date{28.04.1998} \keywords{data mining, KDD, association rules, sampling} \schlagworte{Data Mining, KDD, Assoziationsregeln, Stichproben} \abstract{Die Autoren zeigen, da{\ss} es m\"oglich ist, durch die zuf\"allige Auswahl von Stichproben die Bearbeitungszeit bei der Erstellung von Assoziationsregeln drastisch zu reduzieren und dabei dennoch ein akurates Ergebnis zu erhalten.\par \emph{Original-Abstract: } Data mining is an emerging research area, whose goal is to extract significant patterns or interesting rules from large databases. High-level inference from large volumes of routine business data can provide valuable information to businesses, such as customer buying patterns, shelving criterion in supermarkets and stock trends. However, many algorithms proposed for data mining of association rules make repeated passes over the database to determine the commonly occurring \emph{itemsets} (or set of items). For large databases, the I/O overhead in scanning the database can be extremely high.\par In this paper we show that random sampling of transactions in the database is an effective method for finding association rules. Sampling can speed up the mining process by more than an order of magnitude by reducing I/O costs and drastically shrinking the number of transaction to be considered. We may also be able to make the sampled database resident in main-memory. Furthermore, we show that sampling can accurately represent the data patterns in the database with high confidence. We experimentally evaluate the effectiveness of sampling on three databases.} }, url = {ftp://ftp.cs.rochester.edu/pub/papers/systems/\\97.RIDE.Eval\_\_of\_sampling\_for\_data\_mining\_of\_assoc\_rules.ps.gz} } @inproceedings{zaki*97:localized, author = {Zaki, {Mohammed Javeed} and Parthasarathy, Srinivasan and Li, Wei}, address = {Newport, Rhode Island}, booktitle = {9th Annual {ACM} Symposium on Parallel Algorithms and Architectures {(SPAA)}}, month = {June}, title = {A Localized Algorithm for Parallel Association Mining}, year = {1997}, annote = {\location{Internet} \readers{paffhaus} \date{28.04.1998} \keywords{data mining, KDD, parallel computing, association rules} \schlagworte{Data Mining, KDD, Parallele Algorithmen, Assoziationsregeln} \abstract{Die Autoren schlagen einen parallelen Algorithmus zur Bestimmung von Assoziationsregeln vor, der nach einer Initialisierungsphase ohne weitere Kommunikation oder Synchronisation auskommt und somit den parallelen Algorithmen inh\"arenten Flaschenhals umgeht. Erreicht wird dies durch geeignete Anordnung verwandter \emph{itemsets}.\par \emph{Original-Abstract: }Discovery of association rules is an important database mining problem. Mining for association rules involves extracting patterns from large databases and inferring useful rules from them. Several parallel and sequential algorithms have been proposed in the literature to solve this problem. Almost all of these algorithms make repeated passes over the database to determine the commonly occurring patterns or \emph{itemsets} (set of items), thus incurring high I/O overhead. In the parallel case, these algorithms do a reduction at the end of each pass to construct the global patterns, thus incurring high synchronization cost.\par In this paper we describe a new parallel association mining algorithm. Our algorithm is a result of detailed study of the available parallelism and the properties of associations. The algorithm uses a scheme to cluster related frequent itemsets together, and to partition them among the processors. At the same time it also uses a different database layout which clusters related transactions together, and selectively replicates the database so that the portion of the database needed for the computation of associations is local to each processor. After the initial set-up phase, the algorithm eliminates the need for further communication or synchronization. The algorithm further scans the local database partition only three times, thus minimizing I/O overheads. Unlike previous approaches, the algorithms uses simple intersection operations to compute frequent itemsets and doesn't have to maintain or search complex hash structures.\par Our experimental testbed is a 32-processor DEC Alpha cluster inter-connected by the Memory Channel network. We present results on the performance of our algorithm on various databases, and compare it against a well known parallel algorithm. Our algorithm outperforms it by an more than an order of magnitude.} }, url = {ftp://ftp.cs.rochester.edu/pub/papers/systems/\\97.SPAA.Localized\_algorithm\_for\_parallel\_association\_mining.ps.gz} } @article{zaki*97:parallel, author = {Zaki, Mohammed~J. and Parthasarathy, Srinivasan and Ogihara, Mitsunori and Li, Wei}, journal = {Data Mining and Knowledge Discovery}, pages = {1-32}, publisher = {Kluwer Academic Publishers, Boston}, title = {Parallel Algorithms for Discovery of Association Rules}, year = {1997}, annote = {\location {Internet} \readers{paffhaus} \date{28.04.1998} \keywords{data mining, KDD, parallel algorithms, association rules} \schlagworte{Data Mining, KDD, parallele Algorithmen, Assoziationsregeln} \abstract{Die Autoren beschreiben einen parallele Algorithmen zur Generierung von Assoziationsregeln. Durch die geschickte Anordnung der Daten wird hierbei die n\"otige Kommunikation der Prozessoren untereinander minimiert und somit die Laufzeit minimiert.\par \emph{Original-Abstract: } Discovery of association rules is an important data mining task. Several parallel and sequential algorithms have been proposed in the literature to solve this problem. Almost all of these algorithms make repeated passes over the database to determine the set of frequent itemsets (a subset of database items), thus incurring high I/O overhead. In the parallel case, most algorithms perform a sum-reduction at the end of each pass to construct the global counts, also incurring high synchronization cost.\par In this paper we describe new parallel association mining algorithms. The algorithms use novel itemset clustering techniques to approximate the set of potentially maximal frequent itemsets. Once this set has been identified, the algorithms make use of efficient traversal techniques to generate the frequent itemsets contained in each cluster. We propose two clustering schemes based on equivalence classes and maximal hypergraph cliques, and study two lattice traversal techniques based on bottom-up and hybrid search. We use a vertical database layout to cluster related transactions together. The database is also selectively replicated so that the portion of the database needed for the computation of associations is local to each processor. After the initial set-up phase, the algorithms do not need any further communication or synchronization. The algorithms minimize I/O overheads by scanning the local database portion only twice. Once in the set-up phase, and once when processing the itemset clusters. Unlike previous parallel approaches, the algorithms use simple intersection operations to compute frequent itemsets and do not have to maintain or search complex hash structures.\par Our experimental testbed is a 32-processor DEC Alpha cluster inter-connected by the Memory Channel network. We present results on the performance of our algorithms on various databases, and compare it against a well known parallel algorithm. The best new algorithm outperforms it by an order of magnitude.} } }