작성일자 : 2023-08-23
Ver 0.1.1
DataFrame
- Pandas의 Series가 1차원 형태의 자료구조라면 DataFrame은 여러 개의 열로 구성된 2차원 형태의 자료구조임
- numpy array를 받아 만들 수 있으며, Series로 변환 가능한 오브젝트들을 값으로 갖는 dict를 인자로 넣어 DataFrame을 만들 수도 있음
# The snippets in this note rely on `pd` and `np`; import them at first use
# because no import of either name is visible in this file.
import numpy as np
import pandas as pd

# Build a DataFrame from a dict in which every value can become a column.
# Scalars ('A', 'B', 'F') are repeated on every row, while the Series, the
# array and the Categorical ('C', 'D', 'E') each supply five values.
ex = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(5)), dtype='float32'),
    'D': np.arange(3, 8, dtype='int32'),  # 3, 4, 5, 6, 7
    'E': pd.Categorical(['test', 'train', 'test', 'train', 'test']),
    'F': 'foo',
})
ex
|
A |
B |
C |
D |
E |
F |
0 |
1.0 |
2013-01-02 |
1.0 |
3 |
test |
foo |
1 |
1.0 |
2013-01-02 |
1.0 |
4 |
train |
foo |
2 |
1.0 |
2013-01-02 |
1.0 |
5 |
test |
foo |
3 |
1.0 |
2013-01-02 |
1.0 |
6 |
train |
foo |
4 |
1.0 |
2013-01-02 |
1.0 |
7 |
test |
foo |
- DataFrame의 컬럼들은 각기 특별한 자료형을 갖고 있음
- 이는 DataFrame 내에 있는 dtypes라는 속성을 통해 확인 가능함
- 파이썬의 기본 실수(소수점) 값은 float64로 잡히고, 기본적인 문자열은 str이 아니라 object라는 자료형으로 나타남
ex.dtypes
A float64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
ex2 = pd.DataFrame(np.random.randn(5,2), columns = ['A','B'])
ex2
|
A |
B |
0 |
0.406282 |
0.368642 |
1 |
1.140642 |
-0.077410 |
2 |
-0.812914 |
-0.273615 |
3 |
-0.448528 |
0.854089 |
4 |
-0.237988 |
0.196074 |
ex2.head() #기본값 = 5
|
A |
B |
0 |
0.406282 |
0.368642 |
1 |
1.140642 |
-0.077410 |
2 |
-0.812914 |
-0.273615 |
3 |
-0.448528 |
0.854089 |
4 |
-0.237988 |
0.196074 |
ex2.tail(3)
|
A |
B |
2 |
-0.812914 |
-0.273615 |
3 |
-0.448528 |
0.854089 |
4 |
-0.237988 |
0.196074 |
ex2.index #check index
RangeIndex(start=0, stop=5, step=1)
ex2.columns #check columns
Index(['A', 'B'], dtype='object')
ex2.values #check values
array([[ 0.40628183, 0.36864168],
[ 1.14064213, -0.07741036],
[-0.81291384, -0.27361517],
[-0.44852821, 0.85408865],
[-0.23798752, 0.1960742 ]])
ex2.describe() #simple statistic information of DataFrame
|
A |
B |
count |
5.000000 |
5.000000 |
mean |
0.009499 |
0.213556 |
std |
0.772063 |
0.434924 |
min |
-0.812914 |
-0.273615 |
25% |
-0.448528 |
-0.077410 |
50% |
-0.237988 |
0.196074 |
75% |
0.406282 |
0.368642 |
max |
1.140642 |
0.854089 |
ex2.sort_index(axis =1 , ascending = False)
|
B |
A |
0 |
0.368642 |
0.406282 |
1 |
-0.077410 |
1.140642 |
2 |
-0.273615 |
-0.812914 |
3 |
0.854089 |
-0.448528 |
4 |
0.196074 |
-0.237988 |
ex2.sort_index(axis = 0 , ascending = False)
|
A |
B |
4 |
-0.237988 |
0.196074 |
3 |
-0.448528 |
0.854089 |
2 |
-0.812914 |
-0.273615 |
1 |
1.140642 |
-0.077410 |
0 |
0.406282 |
0.368642 |
ex2.sort_index()
|
A |
B |
0 |
0.406282 |
0.368642 |
1 |
1.140642 |
-0.077410 |
2 |
-0.812914 |
-0.273615 |
3 |
-0.448528 |
0.854089 |
4 |
-0.237988 |
0.196074 |
ex2.sort_values(by='B')
|
A |
B |
2 |
-0.812914 |
-0.273615 |
1 |
1.140642 |
-0.077410 |
4 |
-0.237988 |
0.196074 |
0 |
0.406282 |
0.368642 |
3 |
-0.448528 |
0.854089 |
Selection using pandas
ex2
|
A |
B |
0 |
0.406282 |
0.368642 |
1 |
1.140642 |
-0.077410 |
2 |
-0.812914 |
-0.273615 |
3 |
-0.448528 |
0.854089 |
4 |
-0.237988 |
0.196074 |
ex2['A']
0 0.406282
1 1.140642
2 -0.812914
3 -0.448528
4 -0.237988
Name: A, dtype: float64
ex2.A
0 0.406282
1 1.140642
2 -0.812914
3 -0.448528
4 -0.237988
Name: A, dtype: float64
ex2[['A']]
|
A |
0 |
0.406282 |
1 |
1.140642 |
2 |
-0.812914 |
3 |
-0.448528 |
4 |
-0.237988 |
type(ex2['A'])
pandas.core.series.Series
type(ex2[['A']])
pandas.core.frame.DataFrame
ex2[0:3]
|
A |
B |
0 |
0.406282 |
0.368642 |
1 |
1.140642 |
-0.077410 |
2 |
-0.812914 |
-0.273615 |
Merge DataFrame
df1 = pd.DataFrame({'key' : list('ABCDE'),
'value' : np.random.randn(5)})
df1
|
key |
value |
0 |
A |
-0.604176 |
1 |
B |
-0.882808 |
2 |
C |
-0.253994 |
3 |
D |
0.461608 |
4 |
E |
-0.507770 |
df2 = pd.DataFrame({'key' : list('ABCXZ'),
'value' : np.random.randn(5)})
df2
|
key |
value |
0 |
A |
0.071220 |
1 |
B |
0.957223 |
2 |
C |
0.622761 |
3 |
X |
1.802048 |
4 |
Z |
-0.531795 |
pd.concat([df1,df2]) # axis = 0 (Default), concat by rows
|
key |
value |
0 |
A |
-0.604176 |
1 |
B |
-0.882808 |
2 |
C |
-0.253994 |
3 |
D |
0.461608 |
4 |
E |
-0.507770 |
0 |
A |
0.071220 |
1 |
B |
0.957223 |
2 |
C |
0.622761 |
3 |
X |
1.802048 |
4 |
Z |
-0.531795 |
pd.concat([df1, df2], axis = 0, ignore_index = True)
|
key |
value |
0 |
A |
-0.604176 |
1 |
B |
-0.882808 |
2 |
C |
-0.253994 |
3 |
D |
0.461608 |
4 |
E |
-0.507770 |
5 |
A |
0.071220 |
6 |
B |
0.957223 |
7 |
C |
0.622761 |
8 |
X |
1.802048 |
9 |
Z |
-0.531795 |
pd.concat([df1, df2]).reset_index()
|
index |
key |
value |
0 |
0 |
A |
-0.604176 |
1 |
1 |
B |
-0.882808 |
2 |
2 |
C |
-0.253994 |
3 |
3 |
D |
0.461608 |
4 |
4 |
E |
-0.507770 |
5 |
0 |
A |
0.071220 |
6 |
1 |
B |
0.957223 |
7 |
2 |
C |
0.622761 |
8 |
3 |
X |
1.802048 |
9 |
4 |
Z |
-0.531795 |
pd.concat([df1,df2], axis = 1)
|
key |
value |
key |
value |
0 |
A |
-0.604176 |
A |
0.071220 |
1 |
B |
-0.882808 |
B |
0.957223 |
2 |
C |
-0.253994 |
C |
0.622761 |
3 |
D |
0.461608 |
X |
1.802048 |
4 |
E |
-0.507770 |
Z |
-0.531795 |
df2.columns = ['key','values2']
df2
|
key |
values2 |
0 |
A |
0.071220 |
1 |
B |
0.957223 |
2 |
C |
0.622761 |
3 |
X |
1.802048 |
4 |
Z |
-0.531795 |
pd.concat([df1,df2])
|
key |
value |
values2 |
0 |
A |
-0.604176 |
NaN |
1 |
B |
-0.882808 |
NaN |
2 |
C |
-0.253994 |
NaN |
3 |
D |
0.461608 |
NaN |
4 |
E |
-0.507770 |
NaN |
0 |
A |
NaN |
0.071220 |
1 |
B |
NaN |
0.957223 |
2 |
C |
NaN |
0.622761 |
3 |
X |
NaN |
1.802048 |
4 |
Z |
NaN |
-0.531795 |
pd.merge()
df1
|
key |
value |
0 |
A |
-0.604176 |
1 |
B |
-0.882808 |
2 |
C |
-0.253994 |
3 |
D |
0.461608 |
4 |
E |
-0.507770 |
df2
|
key |
values2 |
0 |
A |
0.071220 |
1 |
B |
0.957223 |
2 |
C |
0.622761 |
3 |
X |
1.802048 |
4 |
Z |
-0.531795 |
pd.merge(df1, df2, on = 'key', how = 'inner')
|
key |
value |
values2 |
0 |
A |
-0.604176 |
0.071220 |
1 |
B |
-0.882808 |
0.957223 |
2 |
C |
-0.253994 |
0.622761 |
pd.merge(df1, df2, on = 'key', how = 'left')
|
key |
value |
values2 |
0 |
A |
-0.604176 |
0.071220 |
1 |
B |
-0.882808 |
0.957223 |
2 |
C |
-0.253994 |
0.622761 |
3 |
D |
0.461608 |
NaN |
4 |
E |
-0.507770 |
NaN |
pd.merge(df1, df2, on = 'key', how = 'right')
|
key |
value |
values2 |
0 |
A |
-0.604176 |
0.071220 |
1 |
B |
-0.882808 |
0.957223 |
2 |
C |
-0.253994 |
0.622761 |
3 |
X |
NaN |
1.802048 |
4 |
Z |
NaN |
-0.531795 |
pd.merge(df1, df2, on = 'key', how = 'outer')
|
key |
value |
values2 |
0 |
A |
-0.604176 |
0.071220 |
1 |
B |
-0.882808 |
0.957223 |
2 |
C |
-0.253994 |
0.622761 |
3 |
D |
0.461608 |
NaN |
4 |
E |
-0.507770 |
NaN |
5 |
X |
NaN |
1.802048 |
6 |
Z |
NaN |
-0.531795 |
Practice using data set - iris dataset
import pandas as pd
from sklearn.datasets import load_iris

# Actually load the dataset: the original cell imported load_iris but never
# called it, so every later use of `iris` raised NameError.
iris = load_iris()

print(iris)  # the loaded object prints as a dict-like mapping; its keys are also readable as attributes
print(iris.DESCR)  # dataset description, read through the DESCR attribute

# Inspect the value stored under each key.
# features
print(iris.data)
print(iris.feature_names)
# labels
print(iris.target)
print(iris.target_names)

# Build a DataFrame whose rows are the measurements in iris.data and whose
# column labels are iris.feature_names.
# NOTE: iris.target is NOT added as a column here, despite what the original
# comment suggested.
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
{'data': array([[5.1, 3.5, 1.4, 0.2],
[4.9, 3. , 1.4, 0.2],
[4.7, 3.2, 1.3, 0.2],
[4.6, 3.1, 1.5, 0.2],
[5. , 3.6, 1.4, 0.2],
[5.4, 3.9, 1.7, 0.4],
[4.6, 3.4, 1.4, 0.3],
[5. , 3.4, 1.5, 0.2],
[4.4, 2.9, 1.4, 0.2],
[4.9, 3.1, 1.5, 0.1],
[5.4, 3.7, 1.5, 0.2],
[4.8, 3.4, 1.6, 0.2],
[4.8, 3. , 1.4, 0.1],
[4.3, 3. , 1.1, 0.1],
[5.8, 4. , 1.2, 0.2],
[5.7, 4.4, 1.5, 0.4],
[5.4, 3.9, 1.3, 0.4],
[5.1, 3.5, 1.4, 0.3],
[5.7, 3.8, 1.7, 0.3],
[5.1, 3.8, 1.5, 0.3],
[5.4, 3.4, 1.7, 0.2],
[5.1, 3.7, 1.5, 0.4],
[4.6, 3.6, 1. , 0.2],
[5.1, 3.3, 1.7, 0.5],
[4.8, 3.4, 1.9, 0.2],
[5. , 3. , 1.6, 0.2],
[5. , 3.4, 1.6, 0.4],
[5.2, 3.5, 1.5, 0.2],
[5.2, 3.4, 1.4, 0.2],
[4.7, 3.2, 1.6, 0.2],
[4.8, 3.1, 1.6, 0.2],
[5.4, 3.4, 1.5, 0.4],
[5.2, 4.1, 1.5, 0.1],
[5.5, 4.2, 1.4, 0.2],
[4.9, 3.1, 1.5, 0.2],
[5. , 3.2, 1.2, 0.2],
[5.5, 3.5, 1.3, 0.2],
[4.9, 3.6, 1.4, 0.1],
[4.4, 3. , 1.3, 0.2],
[5.1, 3.4, 1.5, 0.2],
[5. , 3.5, 1.3, 0.3],
[4.5, 2.3, 1.3, 0.3],
[4.4, 3.2, 1.3, 0.2],
[5. , 3.5, 1.6, 0.6],
[5.1, 3.8, 1.9, 0.4],
[4.8, 3. , 1.4, 0.3],
[5.1, 3.8, 1.6, 0.2],
[4.6, 3.2, 1.4, 0.2],
[5.3, 3.7, 1.5, 0.2],
[5. , 3.3, 1.4, 0.2],
[7. , 3.2, 4.7, 1.4],
[6.4, 3.2, 4.5, 1.5],
[6.9, 3.1, 4.9, 1.5],
[5.5, 2.3, 4. , 1.3],
[6.5, 2.8, 4.6, 1.5],
[5.7, 2.8, 4.5, 1.3],
[6.3, 3.3, 4.7, 1.6],
[4.9, 2.4, 3.3, 1. ],
[6.6, 2.9, 4.6, 1.3],
[5.2, 2.7, 3.9, 1.4],
[5. , 2. , 3.5, 1. ],
[5.9, 3. , 4.2, 1.5],
[6. , 2.2, 4. , 1. ],
[6.1, 2.9, 4.7, 1.4],
[5.6, 2.9, 3.6, 1.3],
[6.7, 3.1, 4.4, 1.4],
[5.6, 3. , 4.5, 1.5],
[5.8, 2.7, 4.1, 1. ],
[6.2, 2.2, 4.5, 1.5],
[5.6, 2.5, 3.9, 1.1],
[5.9, 3.2, 4.8, 1.8],
[6.1, 2.8, 4. , 1.3],
[6.3, 2.5, 4.9, 1.5],
[6.1, 2.8, 4.7, 1.2],
[6.4, 2.9, 4.3, 1.3],
[6.6, 3. , 4.4, 1.4],
[6.8, 2.8, 4.8, 1.4],
[6.7, 3. , 5. , 1.7],
[6. , 2.9, 4.5, 1.5],
[5.7, 2.6, 3.5, 1. ],
[5.5, 2.4, 3.8, 1.1],
[5.5, 2.4, 3.7, 1. ],
[5.8, 2.7, 3.9, 1.2],
[6. , 2.7, 5.1, 1.6],
[5.4, 3. , 4.5, 1.5],
[6. , 3.4, 4.5, 1.6],
[6.7, 3.1, 4.7, 1.5],
[6.3, 2.3, 4.4, 1.3],
[5.6, 3. , 4.1, 1.3],
[5.5, 2.5, 4. , 1.3],
[5.5, 2.6, 4.4, 1.2],
[6.1, 3. , 4.6, 1.4],
[5.8, 2.6, 4. , 1.2],
[5. , 2.3, 3.3, 1. ],
[5.6, 2.7, 4.2, 1.3],
[5.7, 3. , 4.2, 1.2],
[5.7, 2.9, 4.2, 1.3],
[6.2, 2.9, 4.3, 1.3],
[5.1, 2.5, 3. , 1.1],
[5.7, 2.8, 4.1, 1.3],
[6.3, 3.3, 6. , 2.5],
[5.8, 2.7, 5.1, 1.9],
[7.1, 3. , 5.9, 2.1],
[6.3, 2.9, 5.6, 1.8],
[6.5, 3. , 5.8, 2.2],
[7.6, 3. , 6.6, 2.1],
[4.9, 2.5, 4.5, 1.7],
[7.3, 2.9, 6.3, 1.8],
[6.7, 2.5, 5.8, 1.8],
[7.2, 3.6, 6.1, 2.5],
[6.5, 3.2, 5.1, 2. ],
[6.4, 2.7, 5.3, 1.9],
[6.8, 3. , 5.5, 2.1],
[5.7, 2.5, 5. , 2. ],
[5.8, 2.8, 5.1, 2.4],
[6.4, 3.2, 5.3, 2.3],
[6.5, 3. , 5.5, 1.8],
[7.7, 3.8, 6.7, 2.2],
[7.7, 2.6, 6.9, 2.3],
[6. , 2.2, 5. , 1.5],
[6.9, 3.2, 5.7, 2.3],
[5.6, 2.8, 4.9, 2. ],
[7.7, 2.8, 6.7, 2. ],
[6.3, 2.7, 4.9, 1.8],
[6.7, 3.3, 5.7, 2.1],
[7.2, 3.2, 6. , 1.8],
[6.2, 2.8, 4.8, 1.8],
[6.1, 3. , 4.9, 1.8],
[6.4, 2.8, 5.6, 2.1],
[7.2, 3. , 5.8, 1.6],
[7.4, 2.8, 6.1, 1.9],
[7.9, 3.8, 6.4, 2. ],
[6.4, 2.8, 5.6, 2.2],
[6.3, 2.8, 5.1, 1.5],
[6.1, 2.6, 5.6, 1.4],
[7.7, 3. , 6.1, 2.3],
[6.3, 3.4, 5.6, 2.4],
[6.4, 3.1, 5.5, 1.8],
[6. , 3. , 4.8, 1.8],
[6.9, 3.1, 5.4, 2.1],
[6.7, 3.1, 5.6, 2.4],
[6.9, 3.1, 5.1, 2.3],
[5.8, 2.7, 5.1, 1.9],
[6.8, 3.2, 5.9, 2.3],
[6.7, 3.3, 5.7, 2.5],
[6.7, 3. , 5.2, 2.3],
[6.3, 2.5, 5. , 1.9],
[6.5, 3. , 5.2, 2. ],
[6.2, 3.4, 5.4, 2.3],
[5.9, 3. , 5.1, 1.8]]), 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]), 'frame': None, 'target_names': array(['setosa', 'versicolor', 'virginica'], dtype='<U10'), 'DESCR': '.. _iris_dataset:\n\nIris plants dataset\n--------------------\n\n**Data Set Characteristics:**\n\n :Number of Instances: 150 (50 in each of three classes)\n :Number of Attributes: 4 numeric, predictive attributes and the class\n :Attribute Information:\n - sepal length in cm\n - sepal width in cm\n - petal length in cm\n - petal width in cm\n - class:\n - Iris-Setosa\n - Iris-Versicolour\n - Iris-Virginica\n \n :Summary Statistics:\n\n ============== ==== ==== ======= ===== ====================\n Min Max Mean SD Class Correlation\n ============== ==== ==== ======= ===== ====================\n sepal length: 4.3 7.9 5.84 0.83 0.7826\n sepal width: 2.0 4.4 3.05 0.43 -0.4194\n petal length: 1.0 6.9 3.76 1.76 0.9490 (high!)\n petal width: 0.1 2.5 1.20 0.76 0.9565 (high!)\n ============== ==== ==== ======= ===== ====================\n\n :Missing Attribute Values: None\n :Class Distribution: 33.3% for each of 3 classes.\n :Creator: R.A. Fisher\n :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\n :Date: July, 1988\n\nThe famous Iris database, first used by Sir R.A. Fisher. The dataset is taken\nfrom Fisher\'s paper. Note that it\'s the same as in R, but not as in the UCI\nMachine Learning Repository, which has two wrong data points.\n\nThis is perhaps the best known database to be found in the\npattern recognition literature. Fisher\'s paper is a classic in the field and\nis referenced frequently to this day. (See Duda & Hart, for example.) The\ndata set contains 3 classes of 50 instances each, where each class refers to a\ntype of iris plant. One class is linearly separable from the other 2; the\nlatter are NOT linearly separable from each other.\n\n.. topic:: References\n\n - Fisher, R.A. 
"The use of multiple measurements in taxonomic problems"\n Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to\n Mathematical Statistics" (John Wiley, NY, 1950).\n - Duda, R.O., & Hart, P.E. (1973) Pattern Classification and Scene Analysis.\n (Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218.\n - Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System\n Structure and Classification Rule for Recognition in Partially Exposed\n Environments". IEEE Transactions on Pattern Analysis and Machine\n Intelligence, Vol. PAMI-2, No. 1, 67-71.\n - Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule". IEEE Transactions\n on Information Theory, May 1972, 431-433.\n - See also: 1988 MLC Proceedings, 54-64. Cheeseman et al"s AUTOCLASS II\n conceptual clustering system finds 3 classes in the data.\n - Many, many more ...', 'feature_names': ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'], 'filename': 'iris.csv', 'data_module': 'sklearn.datasets.data'}
.. _iris_dataset:
Iris plants dataset
--------------------
**Data Set Characteristics:**
:Number of Instances: 150 (50 in each of three classes)
:Number of Attributes: 4 numeric, predictive attributes and the class
:Attribute Information:
- sepal length in cm
- sepal width in cm
- petal length in cm
- petal width in cm
- class:
- Iris-Setosa
- Iris-Versicolour
- Iris-Virginica
:Summary Statistics:
============== ==== ==== ======= ===== ====================
Min Max Mean SD Class Correlation
============== ==== ==== ======= ===== ====================
sepal length: 4.3 7.9 5.84 0.83 0.7826
sepal width: 2.0 4.4 3.05 0.43 -0.4194
petal length: 1.0 6.9 3.76 1.76 0.9490 (high!)
petal width: 0.1 2.5 1.20 0.76 0.9565 (high!)
============== ==== ==== ======= ===== ====================
:Missing Attribute Values: None
:Class Distribution: 33.3% for each of 3 classes.
:Creator: R.A. Fisher
:Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
:Date: July, 1988
The famous Iris database, first used by Sir R.A. Fisher. The dataset is taken
from Fisher's paper. Note that it's the same as in R, but not as in the UCI
Machine Learning Repository, which has two wrong data points.
This is perhaps the best known database to be found in the
pattern recognition literature. Fisher's paper is a classic in the field and
is referenced frequently to this day. (See Duda & Hart, for example.) The
data set contains 3 classes of 50 instances each, where each class refers to a
type of iris plant. One class is linearly separable from the other 2; the
latter are NOT linearly separable from each other.
.. topic:: References
- Fisher, R.A. "The use of multiple measurements in taxonomic problems"
Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to
Mathematical Statistics" (John Wiley, NY, 1950).
- Duda, R.O., & Hart, P.E. (1973) Pattern Classification and Scene Analysis.
(Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218.
- Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System
Structure and Classification Rule for Recognition in Partially Exposed
Environments". IEEE Transactions on Pattern Analysis and Machine
Intelligence, Vol. PAMI-2, No. 1, 67-71.
- Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule". IEEE Transactions
on Information Theory, May 1972, 431-433.
- See also: 1988 MLC Proceedings, 54-64. Cheeseman et al"s AUTOCLASS II
conceptual clustering system finds 3 classes in the data.
- Many, many more ...
[[5.1 3.5 1.4 0.2]
[4.9 3. 1.4 0.2]
[4.7 3.2 1.3 0.2]
[4.6 3.1 1.5 0.2]
[5. 3.6 1.4 0.2]
[5.4 3.9 1.7 0.4]
[4.6 3.4 1.4 0.3]
[5. 3.4 1.5 0.2]
[4.4 2.9 1.4 0.2]
[4.9 3.1 1.5 0.1]
[5.4 3.7 1.5 0.2]
[4.8 3.4 1.6 0.2]
[4.8 3. 1.4 0.1]
[4.3 3. 1.1 0.1]
[5.8 4. 1.2 0.2]
[5.7 4.4 1.5 0.4]
[5.4 3.9 1.3 0.4]
[5.1 3.5 1.4 0.3]
[5.7 3.8 1.7 0.3]
[5.1 3.8 1.5 0.3]
[5.4 3.4 1.7 0.2]
[5.1 3.7 1.5 0.4]
[4.6 3.6 1. 0.2]
[5.1 3.3 1.7 0.5]
[4.8 3.4 1.9 0.2]
[5. 3. 1.6 0.2]
[5. 3.4 1.6 0.4]
[5.2 3.5 1.5 0.2]
[5.2 3.4 1.4 0.2]
[4.7 3.2 1.6 0.2]
[4.8 3.1 1.6 0.2]
[5.4 3.4 1.5 0.4]
[5.2 4.1 1.5 0.1]
[5.5 4.2 1.4 0.2]
[4.9 3.1 1.5 0.2]
[5. 3.2 1.2 0.2]
[5.5 3.5 1.3 0.2]
[4.9 3.6 1.4 0.1]
[4.4 3. 1.3 0.2]
[5.1 3.4 1.5 0.2]
[5. 3.5 1.3 0.3]
[4.5 2.3 1.3 0.3]
[4.4 3.2 1.3 0.2]
[5. 3.5 1.6 0.6]
[5.1 3.8 1.9 0.4]
[4.8 3. 1.4 0.3]
[5.1 3.8 1.6 0.2]
[4.6 3.2 1.4 0.2]
[5.3 3.7 1.5 0.2]
[5. 3.3 1.4 0.2]
[7. 3.2 4.7 1.4]
[6.4 3.2 4.5 1.5]
[6.9 3.1 4.9 1.5]
[5.5 2.3 4. 1.3]
[6.5 2.8 4.6 1.5]
[5.7 2.8 4.5 1.3]
[6.3 3.3 4.7 1.6]
[4.9 2.4 3.3 1. ]
[6.6 2.9 4.6 1.3]
[5.2 2.7 3.9 1.4]
[5. 2. 3.5 1. ]
[5.9 3. 4.2 1.5]
[6. 2.2 4. 1. ]
[6.1 2.9 4.7 1.4]
[5.6 2.9 3.6 1.3]
[6.7 3.1 4.4 1.4]
[5.6 3. 4.5 1.5]
[5.8 2.7 4.1 1. ]
[6.2 2.2 4.5 1.5]
[5.6 2.5 3.9 1.1]
[5.9 3.2 4.8 1.8]
[6.1 2.8 4. 1.3]
[6.3 2.5 4.9 1.5]
[6.1 2.8 4.7 1.2]
[6.4 2.9 4.3 1.3]
[6.6 3. 4.4 1.4]
[6.8 2.8 4.8 1.4]
[6.7 3. 5. 1.7]
[6. 2.9 4.5 1.5]
[5.7 2.6 3.5 1. ]
[5.5 2.4 3.8 1.1]
[5.5 2.4 3.7 1. ]
[5.8 2.7 3.9 1.2]
[6. 2.7 5.1 1.6]
[5.4 3. 4.5 1.5]
[6. 3.4 4.5 1.6]
[6.7 3.1 4.7 1.5]
[6.3 2.3 4.4 1.3]
[5.6 3. 4.1 1.3]
[5.5 2.5 4. 1.3]
[5.5 2.6 4.4 1.2]
[6.1 3. 4.6 1.4]
[5.8 2.6 4. 1.2]
[5. 2.3 3.3 1. ]
[5.6 2.7 4.2 1.3]
[5.7 3. 4.2 1.2]
[5.7 2.9 4.2 1.3]
[6.2 2.9 4.3 1.3]
[5.1 2.5 3. 1.1]
[5.7 2.8 4.1 1.3]
[6.3 3.3 6. 2.5]
[5.8 2.7 5.1 1.9]
[7.1 3. 5.9 2.1]
[6.3 2.9 5.6 1.8]
[6.5 3. 5.8 2.2]
[7.6 3. 6.6 2.1]
[4.9 2.5 4.5 1.7]
[7.3 2.9 6.3 1.8]
[6.7 2.5 5.8 1.8]
[7.2 3.6 6.1 2.5]
[6.5 3.2 5.1 2. ]
[6.4 2.7 5.3 1.9]
[6.8 3. 5.5 2.1]
[5.7 2.5 5. 2. ]
[5.8 2.8 5.1 2.4]
[6.4 3.2 5.3 2.3]
[6.5 3. 5.5 1.8]
[7.7 3.8 6.7 2.2]
[7.7 2.6 6.9 2.3]
[6. 2.2 5. 1.5]
[6.9 3.2 5.7 2.3]
[5.6 2.8 4.9 2. ]
[7.7 2.8 6.7 2. ]
[6.3 2.7 4.9 1.8]
[6.7 3.3 5.7 2.1]
[7.2 3.2 6. 1.8]
[6.2 2.8 4.8 1.8]
[6.1 3. 4.9 1.8]
[6.4 2.8 5.6 2.1]
[7.2 3. 5.8 1.6]
[7.4 2.8 6.1 1.9]
[7.9 3.8 6.4 2. ]
[6.4 2.8 5.6 2.2]
[6.3 2.8 5.1 1.5]
[6.1 2.6 5.6 1.4]
[7.7 3. 6.1 2.3]
[6.3 3.4 5.6 2.4]
[6.4 3.1 5.5 1.8]
[6. 3. 4.8 1.8]
[6.9 3.1 5.4 2.1]
[6.7 3.1 5.6 2.4]
[6.9 3.1 5.1 2.3]
[5.8 2.7 5.1 1.9]
[6.8 3.2 5.9 2.3]
[6.7 3.3 5.7 2.5]
[6.7 3. 5.2 2.3]
[6.3 2.5 5. 1.9]
[6.5 3. 5.2 2. ]
[6.2 3.4 5.4 2.3]
[5.9 3. 5.1 1.8]]
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2]
['setosa' 'versicolor' 'virginica']
df.head()
|
sepal length (cm) |
sepal width (cm) |
petal length (cm) |
petal width (cm) |
0 |
5.1 |
3.5 |
1.4 |
0.2 |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
2 |
4.7 |
3.2 |
1.3 |
0.2 |
3 |
4.6 |
3.1 |
1.5 |
0.2 |
4 |
5.0 |
3.6 |
1.4 |
0.2 |
df.tail()
|
sepal length (cm) |
sepal width (cm) |
petal length (cm) |
petal width (cm) |
145 |
6.7 |
3.0 |
5.2 |
2.3 |
146 |
6.3 |
2.5 |
5.0 |
1.9 |
147 |
6.5 |
3.0 |
5.2 |
2.0 |
148 |
6.2 |
3.4 |
5.4 |
2.3 |
149 |
5.9 |
3.0 |
5.1 |
1.8 |
df.index #인덱스 확인
RangeIndex(start=0, stop=150, step=1)
df.columns #컬럼 확인
Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
'petal width (cm)'],
dtype='object')
df.dtypes #형식 확인
sepal length (cm) float64
sepal width (cm) float64
petal length (cm) float64
petal width (cm) float64
dtype: object
df[['sepal length (cm)', 'sepal width (cm)']] = df[['sepal length (cm)', 'sepal width (cm)']].astype(object)
df.dtypes #형식 변경
sepal length (cm) object
sepal width (cm) object
petal length (cm) float64
petal width (cm) float64
dtype: object
df[['sepal length (cm)', 'sepal width (cm)']] = df[['sepal length (cm)', 'sepal width (cm)']].astype(float)
df.dtypes
sepal length (cm) float64
sepal width (cm) float64
petal length (cm) float64
petal width (cm) float64
dtype: object
df.info() #데이터 타입, 각 아이템 개수 확인
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 sepal length (cm) 150 non-null float64
1 sepal width (cm) 150 non-null float64
2 petal length (cm) 150 non-null float64
3 petal width (cm) 150 non-null float64
dtypes: float64(4)
memory usage: 4.8 KB
데이터 전처리
# Rename the columns: drop the " (cm)" unit suffix from the four measurements.
# The original mapping also contained 'variety' -> 'species', but this frame
# has no 'variety' column (see the df.columns output earlier), so that entry
# never matched anything and was silently ignored; it is removed here.
df = df.rename(columns={
    'sepal length (cm)': 'sepal length',
    'sepal width (cm)': 'sepal width',
    'petal length (cm)': 'petal length',
    'petal width (cm)': 'petal width',
})
df.head()
|
sepal length |
sepal width |
petal length |
petal width |
0 |
5.1 |
3.5 |
1.4 |
0.2 |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
2 |
4.7 |
3.2 |
1.3 |
0.2 |
3 |
4.6 |
3.1 |
1.5 |
0.2 |
4 |
5.0 |
3.6 |
1.4 |
0.2 |
Dataframe column 선택
- dataframe[ ] 으로 컬럼 추출
- [] -> Series로 반환
- [[]] -> dataframe으로 반환
df.columns
Index(['sepal length', 'sepal width', 'petal length', 'petal width'], dtype='object')
df['sepal length']
0 5.1
1 4.9
2 4.7
3 4.6
4 5.0
...
145 6.7
146 6.3
147 6.5
148 6.2
149 5.9
Name: sepal length, Length: 150, dtype: float64
df[['sepal length']]
|
sepal length |
0 |
5.1 |
1 |
4.9 |
2 |
4.7 |
3 |
4.6 |
4 |
5.0 |
... |
... |
145 |
6.7 |
146 |
6.3 |
147 |
6.5 |
148 |
6.2 |
149 |
5.9 |
150 rows × 1 columns
df[['sepal length', 'sepal width']]
|
sepal length |
sepal width |
0 |
5.1 |
3.5 |
1 |
4.9 |
3.0 |
2 |
4.7 |
3.2 |
3 |
4.6 |
3.1 |
4 |
5.0 |
3.6 |
... |
... |
... |
145 |
6.7 |
3.0 |
146 |
6.3 |
2.5 |
147 |
6.5 |
3.0 |
148 |
6.2 |
3.4 |
149 |
5.9 |
3.0 |
150 rows × 2 columns
dataframe row 선택
- dataframe의 경우 []연산자는 컬럼(column) 선택, 하지만 슬라이싱(slicing)은 행(row) 선택
- .loc[], .iloc[]로 행 선택 가능 (함수 호출 ()가 아니라 대괄호 [] 인덱서임)
- .loc[] : 인덱스 레이블 자체를 사용
- .iloc[] : 0부터 시작하는 정수 위치(0-based) 인덱스 사용
df.head(10)
|
sepal length |
sepal width |
petal length |
petal width |
0 |
5.1 |
3.5 |
1.4 |
0.2 |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
2 |
4.7 |
3.2 |
1.3 |
0.2 |
3 |
4.6 |
3.1 |
1.5 |
0.2 |
4 |
5.0 |
3.6 |
1.4 |
0.2 |
5 |
5.4 |
3.9 |
1.7 |
0.4 |
6 |
4.6 |
3.4 |
1.4 |
0.3 |
7 |
5.0 |
3.4 |
1.5 |
0.2 |
8 |
4.4 |
2.9 |
1.4 |
0.2 |
9 |
4.9 |
3.1 |
1.5 |
0.1 |
df[0:5]
|
sepal length |
sepal width |
petal length |
petal width |
0 |
5.1 |
3.5 |
1.4 |
0.2 |
1 |
4.9 |
3.0 |
1.4 |
0.2 |
2 |
4.7 |
3.2 |
1.3 |
0.2 |
3 |
4.6 |
3.1 |
1.5 |
0.2 |
4 |
5.0 |
3.6 |
1.4 |
0.2 |
df.index = df.index + 100
df.head()
|
sepal length |
sepal width |
petal length |
petal width |
100 |
5.1 |
3.5 |
1.4 |
0.2 |
101 |
4.9 |
3.0 |
1.4 |
0.2 |
102 |
4.7 |
3.2 |
1.3 |
0.2 |
103 |
4.6 |
3.1 |
1.5 |
0.2 |
104 |
5.0 |
3.6 |
1.4 |
0.2 |
df.loc[[100]]
|
sepal length |
sepal width |
petal length |
petal width |
100 |
5.1 |
3.5 |
1.4 |
0.2 |
df.iloc[[30]]
|
sepal length |
sepal width |
petal length |
petal width |
130 |
4.8 |
3.1 |
1.6 |
0.2 |
df.iloc[[0]]
|
sepal length |
sepal width |
petal length |
petal width |
100 |
5.1 |
3.5 |
1.4 |
0.2 |
df.loc[[100,101,102],["sepal length", "sepal width"]]
|
sepal length |
sepal width |
100 |
5.1 |
3.5 |
101 |
4.9 |
3.0 |
102 |
4.7 |
3.2 |
df.iloc[[0,1,2],[0,3]]
|
sepal length |
petal width |
100 |
5.1 |
0.2 |
101 |
4.9 |
0.2 |
102 |
4.7 |
0.2 |